diff --git a/.readthedocs.yml b/.readthedocs.yml index b84167fa1c..ceac6c46a4 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -5,7 +5,7 @@ version: 2 python: - version: 3.6 + version: 3.9 install: - method: pip path: . diff --git a/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst b/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst index c615ad67aa..329788c41d 100644 --- a/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst +++ b/doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst @@ -243,16 +243,25 @@ TensorFlow API .. function:: smdistributed.dataparallel.tensorflow.allreduce(tensor, param_index, num_params, compression=Compression.none, op=ReduceOp.AVERAGE) - Performs an all-reduce operation on a tensor (``tf.Tensor``). + Performs an ``allreduce`` operation on a tensor (``tf.Tensor``). + + This is the ``smdistributed.dataparallel`` package's AllReduce API for TensorFlow, used to allreduce + gradient tensors. By default, ``smdistributed.dataparallel`` allreduce averages the + gradient tensors across participating workers. + + .. note:: + + :class:`smdistributed.dataparallel.tensorflow.allreduce()` should + only be used to allreduce gradient tensors. + For other (non-gradient) tensors, you must use + :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()`. + If you use :class:`smdistributed.dataparallel.tensorflow.allreduce()` + for non-gradient tensors, + the distributed training job might stall or stop. - ``smdistributed.dataparallel`` AllReduce API can be used for all - reducing gradient tensors or any other tensors. By - default, ``smdistributed.dataparallel`` AllReduce averages the - tensors across the participating workers. - ​ **Inputs:** - - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks. + - ``tensor (tf.Tensor)(required)``: The tensor to be allreduced. The shape of the input must be identical across all ranks. - ``param_index (int)(required):`` 0 if you are reducing a single tensor. Index of the tensor if you are reducing a list of tensors. - ``num_params (int)(required):`` len(tensor). - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. @@ -306,9 +315,9 @@ TensorFlow API .. function:: smdistributed.dataparallel.tensorflow.oob_allreduce(tensor, compression=Compression.none, op=ReduceOp.AVERAGE) - OutOfBand (oob) AllReduce is simplified AllReduce function for use cases + Out-of-band (oob) AllReduce is a simplified AllReduce function for use cases such as calculating total loss across all the GPUs in the training. - oob_allreduce average the tensors, as reduction operation, across the + ``oob_allreduce`` averages the tensors, as the reduction operation, across the worker nodes. **Inputs:** - ``tensor (tf.Tensor)(required)``: The tensor to be all-reduced. The shape of the input must be identical across all ranks. - ``compression (smdistributed.dataparallel.tensorflow.Compression)(optional)``: Compression algorithm used to reduce the amount of data sent and received by each worker node. Defaults to not using compression. - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given. **Returns:** - ``None`` - .. rubric:: Notes - - ``smdistributed.dataparallel.tensorflow.oob_allreduce``, in most - cases, is ~2x slower - than ``smdistributed.dataparallel.tensorflow.allreduce``  so it is not - recommended to be used for performing gradient reduction during the - training - process. ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally - uses NCCL AllReduce with ``ncclSum`` as the reduction operation. + .. 
note:: + + In most cases, the :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()` + function is ~2x slower + than :class:`smdistributed.dataparallel.tensorflow.allreduce()`. It is not + recommended to use the :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()` + function for performing gradient + reduction during the training process. + ``smdistributed.dataparallel.tensorflow.oob_allreduce`` internally + uses NCCL AllReduce with ``ncclSum`` as the reduction operation. + + .. note:: + + :class:`smdistributed.dataparallel.tensorflow.oob_allreduce()` should + only be used to allreduce non-gradient tensors. + If you use :class:`smdistributed.dataparallel.tensorflow.allreduce()` + for non-gradient tensors, + the distributed training job might stall or stop. + To allreduce gradients, use :class:`smdistributed.dataparallel.tensorflow.allreduce()`. .. function:: smdistributed.dataparallel.tensorflow.overlap(tensor) diff --git a/doc/conf.py b/doc/conf.py index a866c7292b..5e2c991937 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -10,7 +10,7 @@ # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. -"""Placeholder docstring""" +"""Sphinx configuration for building the Read the Docs documentation.""" from __future__ import absolute_import import pkg_resources diff --git a/doc/overview.rst b/doc/overview.rst index df320e3b47..ffc74e3e52 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -773,11 +773,10 @@ Deployment may take about 5 minutes.    instance_type=instance_type, ) -Because ``catboost`` and ``lightgbm`` rely on the PyTorch Deep Learning Containers -image, the corresponding Models and Endpoints display the “pytorch” -prefix when viewed in the AWS console. To verify that these models -were created successfully with your desired base model, refer to -the ``Tags`` section. +Because the model and script URIs are distributed by SageMaker JumpStart, +the endpoint, endpoint config, and model resources are prefixed with +``sagemaker-jumpstart``. Refer to the model ``Tags`` to inspect the +JumpStart artifacts involved in creating the model. Perform Inference ----------------- diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index fd74633584..999f5d10f0 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -50,6 +50,7 @@ from sagemaker.job import _Job from sagemaker.jumpstart.utils import ( add_jumpstart_tags, + get_jumpstart_base_name_if_jumpstart_model, update_inference_tags_with_jumpstart_training_tags, ) from sagemaker.local import LocalSession @@ -569,8 +570,11 @@ def prepare_workflow_for_training(self, job_name=None): def _ensure_base_job_name(self): """Set ``self.base_job_name`` if it is not set already.""" # honor supplied base_job_name or generate it - if self.base_job_name is None: - self.base_job_name = base_name_from_image(self.training_image_uri()) + self.base_job_name = ( + self.base_job_name + or get_jumpstart_base_name_if_jumpstart_model(self.source_dir, self.model_uri) + or base_name_from_image(self.training_image_uri()) + ) def _get_or_create_name(self, name=None): """Generate a name based on the base job name or training image if needed. 
@@ -1208,7 +1212,15 @@ def deploy( is_serverless = serverless_inference_config is not None self._ensure_latest_training_job() self._ensure_base_job_name() - default_name = name_from_base(self.base_job_name) + + jumpstart_base_name = get_jumpstart_base_name_if_jumpstart_model( + kwargs.get("source_dir"), self.source_dir, kwargs.get("model_data"), self.model_uri + ) + default_name = ( + name_from_base(jumpstart_base_name) + if jumpstart_base_name + else name_from_base(self.base_job_name) + ) endpoint_name = endpoint_name or default_name model_name = model_name or default_name diff --git a/src/sagemaker/huggingface/estimator.py b/src/sagemaker/huggingface/estimator.py index 9d154d7183..81b24b5aa3 100644 --- a/src/sagemaker/huggingface/estimator.py +++ b/src/sagemaker/huggingface/estimator.py @@ -50,14 +50,15 @@ def __init__( compiler_config=None, **kwargs, ): - """This ``Estimator`` executes a HuggingFace script in a managed execution environment. + """This estimator runs a Hugging Face training script in a SageMaker training environment. - The managed HuggingFace environment is an Amazon-built Docker container that executes - functions defined in the supplied ``entry_point`` Python script within a SageMaker - Training Job. + The estimator initiates the SageMaker-managed Hugging Face environment + by using the pre-built Hugging Face Docker container and runs + the Hugging Face training script that the user provides through + the ``entry_point`` argument. - Training is started by calling - :meth:`~sagemaker.amazon.estimator.Framework.fit` on this Estimator. + After configuring the estimator class, use the class method + :meth:`~sagemaker.amazon.estimator.Framework.fit()` to start a training job. Args: py_version (str): Python version you want to use for executing your model training diff --git a/src/sagemaker/image_uri_config/neo-tensorflow.json b/src/sagemaker/image_uri_config/neo-tensorflow.json index 912ccbfc43..8fe391b43d 100644 --- a/src/sagemaker/image_uri_config/neo-tensorflow.json +++ b/src/sagemaker/image_uri_config/neo-tensorflow.json @@ -12,7 +12,8 @@ "1.11.0": "1.15.3", "1.12.0": "1.15.3", "1.13.0": "1.15.3", - "1.14.0": "1.15.3" + "1.14.0": "1.15.3", + "2.4.2": "2.4.2" }, "versions": { "1.15.3": { @@ -44,6 +45,36 @@ "us-west-2": "301217895009" }, "repository": "sagemaker-inference-tensorflow" + }, + "2.4.2": { + "py_versions": ["py3"], + "registries": { + "af-south-1": "774647643957", + "ap-east-1": "110948597952", + "ap-northeast-1": "941853720454", + "ap-northeast-2": "151534178276", + "ap-northeast-3": "925152966179", + "ap-south-1": "763008648453", + "ap-southeast-1": "324986816169", + "ap-southeast-2": "355873309152", + "ca-central-1": "464438896020", + "cn-north-1": "472730292857", + "cn-northwest-1": "474822919863", + "eu-central-1": "746233611703", + "eu-north-1": "601324751636", + "eu-south-1": "966458181534", + "eu-west-1": "802834080501", + "eu-west-2": "205493899709", + "eu-west-3": "254080097072", + "me-south-1": "836785723513", + "sa-east-1": "756306329178", + "us-east-1": "785573368785", + "us-east-2": "007439368137", + "us-gov-west-1": "263933020539", + "us-west-1": "710691900526", + "us-west-2": "301217895009" + }, + "repository": "sagemaker-inference-tensorflow" } } } diff --git a/src/sagemaker/jumpstart/cache.py b/src/sagemaker/jumpstart/cache.py index 25d3b37fcb..cf9df91519 100644 --- a/src/sagemaker/jumpstart/cache.py +++ b/src/sagemaker/jumpstart/cache.py @@ -229,7 +229,7 @@ def _get_manifest_key_from_model_id_semantic_version( ) else: - possible_model_ids = 
[header.model_id for header in manifest.values()] + possible_model_ids = [header.model_id for header in manifest.values()] # type: ignore closest_model_id = get_close_matches(model_id, possible_model_ids, n=1, cutoff=0)[0] error_msg += f"Did you mean to use model ID '{closest_model_id}'?" diff --git a/src/sagemaker/jumpstart/constants.py b/src/sagemaker/jumpstart/constants.py index a748beac89..2b0fb4ee12 100644 --- a/src/sagemaker/jumpstart/constants.py +++ b/src/sagemaker/jumpstart/constants.py @@ -124,3 +124,5 @@ SUPPORTED_JUMPSTART_SCOPES = set(scope.value for scope in JumpStartScriptScope) ENV_VARIABLE_JUMPSTART_CONTENT_BUCKET_OVERRIDE = "AWS_JUMPSTART_CONTENT_BUCKET_OVERRIDE" + +JUMPSTART_RESOURCE_BASE_NAME = "sagemaker-jumpstart" diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py index c59966d1b5..e72b04b372 100644 --- a/src/sagemaker/jumpstart/utils.py +++ b/src/sagemaker/jumpstart/utils.py @@ -232,6 +232,22 @@ def add_single_jumpstart_tag( return curr_tags +def get_jumpstart_base_name_if_jumpstart_model( + *uris: Optional[str], +) -> Optional[str]: + """Return default JumpStart base name if a URI belongs to JumpStart. + + If no URIs belong to JumpStart, return None. + + Args: + *uris (Optional[str]): URIs to test for association with JumpStart. + """ + for uri in uris: + if is_jumpstart_model_uri(uri): + return constants.JUMPSTART_RESOURCE_BASE_NAME + return None + + def add_jumpstart_tags( tags: Optional[List[Dict[str, str]]] = None, inference_model_uri: Optional[str] = None, diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 2d01bb4c0f..b8c6b4ea22 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -33,7 +33,7 @@ from sagemaker.predictor import PredictorBase from sagemaker.serverless import ServerlessInferenceConfig from sagemaker.transformer import Transformer -from sagemaker.jumpstart.utils import add_jumpstart_tags +from sagemaker.jumpstart.utils import add_jumpstart_tags, get_jumpstart_base_name_if_jumpstart_model from sagemaker.utils import unique_name_from_base from sagemaker.async_inference import AsyncInferenceConfig from sagemaker.predictor_async import AsyncPredictor @@ -466,7 +466,7 @@ def _upload_code(self, key_prefix: str, repack: bool = False) -> None: ) def _script_mode_env_vars(self): - """Placeholder docstring""" + """Returns a mapping of environment variables for script mode execution.""" script_name = None dir_name = None if self.uploaded_code: @@ -478,8 +478,11 @@ def _script_mode_env_vars(self): elif self.entry_point is not None: script_name = self.entry_point if self.source_dir is not None: - dir_name = "file://" + self.source_dir - + dir_name = ( + self.source_dir + if self.source_dir.startswith("s3://") + else "file://" + self.source_dir + ) return { SCRIPT_PARAM_NAME.upper(): script_name or str(), DIR_PARAM_NAME.upper(): dir_name or str(), @@ -514,7 +517,9 @@ def _create_sagemaker_model(self, instance_type=None, accelerator_type=None, tag """ container_def = self.prepare_container_def(instance_type, accelerator_type=accelerator_type) - self._ensure_base_name_if_needed(container_def["Image"]) + self._ensure_base_name_if_needed( + image_uri=container_def["Image"], script_uri=self.source_dir, model_uri=self.model_data + ) self._set_model_name_if_needed() enable_network_isolation = self.enable_network_isolation() @@ -529,10 +534,17 @@ def _create_sagemaker_model(self, instance_type=None, accelerator_type=None, tag tags=tags, ) - def _ensure_base_name_if_needed(self, image_uri): - """Create a base 
name from the image URI if there is no model name provided.""" + def _ensure_base_name_if_needed(self, image_uri, script_uri, model_uri): + """Create a base name from the image URI if there is no model name provided. + + If a JumpStart script or model URI is used, select the JumpStart base name. + """ if self.name is None: - self._base_name = self._base_name or utils.base_name_from_image(image_uri) + self._base_name = ( + self._base_name + or get_jumpstart_base_name_if_jumpstart_model(script_uri, model_uri) + or utils.base_name_from_image(image_uri) + ) def _set_model_name_if_needed(self): """Generate a new model name if ``self._base_name`` is present.""" @@ -963,7 +975,9 @@ def deploy( compiled_model_suffix = None if is_serverless else "-".join(instance_type.split(".")[:-1]) if self._is_compiled_model and not is_serverless: - self._ensure_base_name_if_needed(self.image_uri) + self._ensure_base_name_if_needed( + image_uri=self.image_uri, script_uri=self.source_dir, model_uri=self.model_data + ) if self._base_name is not None: self._base_name = "-".join((self._base_name, compiled_model_suffix)) diff --git a/src/sagemaker/serializers.py b/src/sagemaker/serializers.py index 727912a33c..3055fb5c18 100644 --- a/src/sagemaker/serializers.py +++ b/src/sagemaker/serializers.py @@ -18,7 +18,6 @@ import csv import io import json - import numpy as np from six import with_metaclass @@ -357,3 +356,37 @@ def serialize(self, data): return data.read() raise ValueError("Unable to handle input format: %s" % type(data)) + + +class DataSerializer(SimpleBaseSerializer): + """Serialize data in any file by extracting raw bytes from the file.""" + + def __init__(self, content_type="file-path/raw-bytes"): + """Initialize a ``DataSerializer`` instance. + + Args: + content_type (str): The MIME type to signal to the inference endpoint when sending + request data (default: "file-path/raw-bytes"). + """ + super(DataSerializer, self).__init__(content_type=content_type) + + def serialize(self, data): + """Serialize file data to raw bytes. + + Args: + data (object): Data to be serialized. The data can be a string + representing a file path or the raw bytes from a file. + Returns: + raw-bytes: The data serialized as raw bytes from the input. + """ + if isinstance(data, str): + try: + with open(data, "rb") as data_file: + data_file_info = data_file.read() + return data_file_info + except Exception as e: + raise ValueError(f"Could not open/read file: {data}. {e}") + if isinstance(data, bytes): + return data + + raise ValueError(f"Object of type {type(data)} is not Data serializable.") diff --git a/src/sagemaker/training_compiler/config.py b/src/sagemaker/training_compiler/config.py index 0659c43507..c45fa4cdaf 100644 --- a/src/sagemaker/training_compiler/config.py +++ b/src/sagemaker/training_compiler/config.py @@ -18,11 +18,7 @@ class TrainingCompilerConfig(object): - """The configuration class for accelerating SageMaker training jobs through compilation. - - SageMaker Training Compiler speeds up training by optimizing the model execution graph. - - """ + """The SageMaker Training Compiler configuration class.""" DEBUG_PATH = "/opt/ml/output/data/compiler/" SUPPORTED_INSTANCE_CLASS_PREFIXES = ["p3", "g4dn", "p4"] @@ -37,9 +33,15 @@ def __init__( ): """This class initializes a ``TrainingCompilerConfig`` instance. - Pass the output of it to the ``compiler_config`` + `Amazon SageMaker Training Compiler + `_ + is a feature of SageMaker Training + and speeds up training jobs by optimizing model execution graphs. 
+ + You can compile Hugging Face models + by passing an object of this configuration class to the ``compiler_config`` parameter of the :class:`~sagemaker.huggingface.HuggingFace` - class. + estimator. Args: enabled (bool): Optional. Switch to enable SageMaker Training Compiler. The default is ``True``. debug (bool): Optional. Whether to dump detailed logs for debugging. This comes with a potential performance slowdown. The default is ``False``. - **Example**: The following example shows the basic ``compiler_config`` - parameter configuration, enabling compilation with default parameter values. + **Example**: The following code shows the basic usage of the + :class:`sagemaker.huggingface.TrainingCompilerConfig()` class + to run a Hugging Face training job with the compiler. .. code-block:: python - from sagemaker.huggingface import TrainingCompilerConfig - compiler_config = TrainingCompilerConfig() + from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig + + huggingface_estimator = HuggingFace( + ... + compiler_config=TrainingCompilerConfig() + ) + + .. seealso:: + + For more information about how to enable SageMaker Training Compiler + for various training settings such as TensorFlow-based models, + PyTorch-based models, and distributed training, + see `Enable SageMaker Training Compiler + `_ + in the `Amazon SageMaker Training Compiler developer guide + `_. """ diff --git a/src/sagemaker/workflow/steps.py b/src/sagemaker/workflow/steps.py index 99f3444f23..1ef63ef915 100644 --- a/src/sagemaker/workflow/steps.py +++ b/src/sagemaker/workflow/steps.py @@ -301,6 +301,8 @@ def arguments(self) -> RequestType: ) request_dict = self.estimator.sagemaker_session._get_train_request(**train_args) request_dict.pop("TrainingJobName") + if "HyperParameters" in request_dict: + request_dict["HyperParameters"].pop("sagemaker_job_name", None) return request_dict diff --git a/tests/data/cuteCat.raw b/tests/data/cuteCat.raw new file mode 100644 index 0000000000..6e89b9d78f Binary files /dev/null and b/tests/data/cuteCat.raw differ diff --git a/tests/integ/__init__.py b/tests/integ/__init__.py index 678e6c3eb1..b7ed6cb41e 100644 --- a/tests/integ/__init__.py +++ b/tests/integ/__init__.py @@ -148,12 +148,6 @@ "eu-west-2", "us-east-1", ] -NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS = [ - "ap-northeast-3", - "ap-south-1", - "eu-north-1", - "sa-east-1", -] EDGE_PACKAGING_SUPPORTED_REGIONS = [ "us-east-2", "us-west-2", diff --git a/tests/integ/sagemaker/jumpstart/script_mode_class/test_transfer_learning.py b/tests/integ/sagemaker/jumpstart/script_mode_class/test_transfer_learning.py index 29b16cf9a5..b5c82eece1 100644 --- a/tests/integ/sagemaker/jumpstart/script_mode_class/test_transfer_learning.py +++ b/tests/integ/sagemaker/jumpstart/script_mode_class/test_transfer_learning.py @@ -21,7 +21,7 @@ TRAINING_ENTRY_POINT_SCRIPT_NAME, ) from sagemaker.jumpstart.utils import get_jumpstart_content_bucket -from sagemaker.utils import name_from_base +from sagemaker.predictor import Predictor from tests.integ.sagemaker.jumpstart.constants import ( ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID, JUMPSTART_TAG, @@ -106,19 +106,17 @@ def test_jumpstart_transfer_learning_estimator_class(setup): model_id=model_id, model_version=model_version, model_scope="inference" ) - endpoint_name = name_from_base(f"{model_id}-transfer-learning") - - estimator.deploy( + predictor: Predictor = estimator.deploy( initial_instance_count=instance_count, instance_type=inference_instance_type, entry_point=INFERENCE_ENTRY_POINT_SCRIPT_NAME, image_uri=image_uri, source_dir=script_uri, - 
endpoint_name=endpoint_name, + tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}], ) endpoint_invoker = EndpointInvoker( - endpoint_name=endpoint_name, + endpoint_name=predictor.endpoint_name, ) response = endpoint_invoker.invoke_spc_endpoint(["hello", "world"]) diff --git a/tests/integ/sagemaker/lineage/conftest.py b/tests/integ/sagemaker/lineage/conftest.py index 4ede5c193d..8922c011f7 100644 --- a/tests/integ/sagemaker/lineage/conftest.py +++ b/tests/integ/sagemaker/lineage/conftest.py @@ -26,7 +26,9 @@ artifact, ) from sagemaker.model import ModelPackage -from tests.integ.test_workflow import test_end_to_end_pipeline_successful_execution +from tests.integ.sagemaker.workflow.test_workflow import ( + test_end_to_end_pipeline_successful_execution, +) from sagemaker.workflow.pipeline import _PipelineExecution from sagemaker.session import get_execution_role from smexperiments import trial_component, trial, experiment diff --git a/tests/integ/sagemaker/workflow/__init__.py b/tests/integ/sagemaker/workflow/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integ/sagemaker/workflow/test_callback_steps.py b/tests/integ/sagemaker/workflow/test_callback_steps.py new file mode 100644 index 0000000000..c56ff7c120 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_callback_steps.py @@ -0,0 +1,118 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. 
+from __future__ import absolute_import + +import re + +import pytest + +from sagemaker import get_execution_role, utils +from sagemaker.workflow.callback_step import CallbackOutput, CallbackStep, CallbackOutputTypeEnum +from sagemaker.workflow.parameters import ParameterInteger +from sagemaker.workflow.pipeline import Pipeline + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-callback") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +def test_one_step_callback_pipeline(sagemaker_session, role, pipeline_name, region_name): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String) + step_callback = CallbackStep( + name="callback-step", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": "foo"}, + outputs=[outputParam1], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_callback], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role) + update_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_two_step_callback_pipeline_with_output_reference( + sagemaker_session, role, pipeline_name, region_name +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String) + step_callback1 = CallbackStep( + name="callback-step1", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": "foo"}, + outputs=[outputParam1], + ) + + step_callback2 = CallbackStep( + name="callback-step2", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": outputParam1}, + outputs=[], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_callback1, step_callback2], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/test_workflow_with_clarify_check_steps.py b/tests/integ/sagemaker/workflow/test_clarify_check_steps.py similarity index 95% rename from tests/integ/test_workflow_with_clarify_check_steps.py rename to tests/integ/sagemaker/workflow/test_clarify_check_steps.py index aa3650d666..b0d4ac6cbb 100644 --- a/tests/integ/test_workflow_with_clarify_check_steps.py +++ b/tests/integ/sagemaker/workflow/test_clarify_check_steps.py @@ -19,7 +19,6 @@ import pytest from botocore.exceptions import WaiterError -import tests from sagemaker.clarify import ( BiasConfig, DataConfig, @@ -129,10 +128,6 @@ def data_bias_check_config(data_config, bias_config): ) 
-@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"ClarifyCheckStep is not fully deployed in {tests.integ.test_region()}.", -) def test_one_step_data_bias_pipeline_happycase( sagemaker_session, role, @@ -220,10 +215,6 @@ def test_one_step_data_bias_pipeline_happycase( pass -@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"ClarifyCheckStep is not fully deployed in {tests.integ.test_region()}.", -) def test_one_step_data_bias_pipeline_constraint_violation( sagemaker_session, role, diff --git a/tests/integ/sagemaker/workflow/test_emr_steps.py b/tests/integ/sagemaker/workflow/test_emr_steps.py new file mode 100644 index 0000000000..fde6163fe6 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_emr_steps.py @@ -0,0 +1,82 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import re + +import pytest + +from sagemaker import get_execution_role, utils +from sagemaker.workflow.emr_step import EMRStep, EMRStepConfig +from sagemaker.workflow.parameters import ParameterInteger +from sagemaker.workflow.pipeline import Pipeline + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-emr") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +def test_two_steps_emr_pipeline(sagemaker_session, role, pipeline_name, region_name): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + emr_step_config = EMRStepConfig( + jar="s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar", + args=["dummy_emr_script_path"], + ) + + step_emr_1 = EMRStep( + name="emr-step-1", + cluster_id="j-1YONHTCP3YZKC", + display_name="emr_step_1", + description="MyEMRStepDescription", + step_config=emr_step_config, + ) + + step_emr_2 = EMRStep( + name="emr-step-2", + cluster_id=step_emr_1.properties.ClusterId, + display_name="emr_step_2", + description="MyEMRStepDescription", + step_config=emr_step_config, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_emr_1, step_emr_2], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/test_workflow_experiment.py b/tests/integ/sagemaker/workflow/test_experiment.py similarity index 100% rename from tests/integ/test_workflow_experiment.py rename to tests/integ/sagemaker/workflow/test_experiment.py diff --git a/tests/integ/test_workflow_with_fail_steps.py b/tests/integ/sagemaker/workflow/test_fail_steps.py similarity index 100% rename from 
tests/integ/test_workflow_with_fail_steps.py rename to tests/integ/sagemaker/workflow/test_fail_steps.py diff --git a/tests/integ/sagemaker/workflow/test_lambda_steps.py b/tests/integ/sagemaker/workflow/test_lambda_steps.py new file mode 100644 index 0000000000..25abce6bc7 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_lambda_steps.py @@ -0,0 +1,132 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import re + +import pytest + +from sagemaker import get_execution_role, utils +from sagemaker.workflow.lambda_step import ( + LambdaStep, + LambdaOutput, + LambdaOutputTypeEnum, +) +from sagemaker.lambda_helper import Lambda +from sagemaker.workflow.parameters import ParameterInteger +from sagemaker.workflow.pipeline import Pipeline + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-lambda") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +def test_one_step_lambda_pipeline(sagemaker_session, role, pipeline_name, region_name): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String) + step_lambda = LambdaStep( + name="lambda-step", + lambda_func=Lambda( + function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), + session=sagemaker_session, + ), + inputs={"arg1": "foo"}, + outputs=[outputParam1], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_lambda], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role) + update_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_two_step_lambda_pipeline_with_output_reference( + sagemaker_session, role, pipeline_name, region_name +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + + outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String) + step_lambda1 = LambdaStep( + name="lambda-step1", + lambda_func=Lambda( + function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), + session=sagemaker_session, + ), + inputs={"arg1": "foo"}, + outputs=[outputParam1], + ) + + step_lambda2 = LambdaStep( + name="lambda-step2", + lambda_func=Lambda( + function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), + session=sagemaker_session, + ), + 
inputs={"arg1": outputParam1}, + outputs=[], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_lambda1, step_lambda2], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/sagemaker/workflow/test_model_registration.py b/tests/integ/sagemaker/workflow/test_model_registration.py new file mode 100644 index 0000000000..193bbb9755 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_model_registration.py @@ -0,0 +1,747 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import logging +import os +import re + +import pytest +from botocore.exceptions import WaiterError + +import tests +from tests.integ.retry import retries +from sagemaker.drift_check_baselines import DriftCheckBaselines +from sagemaker import ( + PipelineModel, + TrainingInput, + Model, + ModelMetrics, + MetricsSource, + get_execution_role, +) +from sagemaker import FileSource, utils +from sagemaker.inputs import CreateModelInput +from sagemaker.processing import ProcessingInput, ProcessingOutput +from sagemaker.pytorch import PyTorch +from sagemaker.s3 import S3Uploader +from sagemaker.sklearn import SKLearnModel, SKLearnProcessor +from sagemaker.mxnet.model import MXNetModel +from sagemaker.workflow.condition_step import ConditionStep +from sagemaker.workflow.parameters import ParameterInteger, ParameterString +from sagemaker.workflow.pipeline import Pipeline +from sagemaker.workflow.step_collections import RegisterModel +from sagemaker.workflow.steps import CreateModelStep, ProcessingStep, TrainingStep +from sagemaker.xgboost import XGBoostModel +from sagemaker.xgboost import XGBoost +from sagemaker.workflow.conditions import ( + ConditionGreaterThanOrEqualTo, + ConditionIn, +) +from tests.integ.kms_utils import get_or_create_kms_key +from tests.integ import DATA_DIR + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-model-regis") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +def test_conditional_pytorch_training_model_registration( + sagemaker_session, + role, + cpu_instance_type, + pipeline_name, + region_name, +): + base_dir = os.path.join(DATA_DIR, "pytorch_mnist") + entry_point = os.path.join(base_dir, "mnist.py") + input_path = sagemaker_session.upload_data( + path=os.path.join(base_dir, "training"), + key_prefix="integ-test-data/pytorch_mnist/training", + ) + inputs = TrainingInput(s3_data=input_path) + + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", 
default_value="ml.m5.xlarge") + good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1) + in_condition_input = ParameterString(name="Foo", default_value="Foo") + + pytorch_estimator = PyTorch( + entry_point=entry_point, + role=role, + framework_version="1.5.0", + py_version="py3", + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + ) + step_train = TrainingStep( + name="pytorch-train", + estimator=pytorch_estimator, + inputs=inputs, + ) + + step_register = RegisterModel( + name="pytorch-register-model", + estimator=pytorch_estimator, + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + content_types=["*"], + response_types=["*"], + inference_instances=["*"], + transform_instances=["*"], + description="test-description", + ) + + model = Model( + image_uri=pytorch_estimator.training_image_uri(), + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + sagemaker_session=sagemaker_session, + role=role, + ) + model_inputs = CreateModelInput( + instance_type="ml.m5.large", + accelerator_type="ml.eia1.medium", + ) + step_model = CreateModelStep( + name="pytorch-model", + model=model, + inputs=model_inputs, + ) + + step_cond = ConditionStep( + name="cond-good-enough", + conditions=[ + ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1), + ConditionIn(value=in_condition_input, in_values=["foo", "bar"]), + ], + if_steps=[step_train, step_register], + else_steps=[step_model], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[ + in_condition_input, + good_enough_input, + instance_count, + instance_type, + ], + steps=[step_cond], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + execution = pipeline.start(parameters={"GoodEnoughInput": 0}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_mxnet_model_registration( + sagemaker_session, + role, + cpu_instance_type, + pipeline_name, + region_name, +): + base_dir = os.path.join(DATA_DIR, "mxnet_mnist") + source_dir = os.path.join(base_dir, "code") + entry_point = os.path.join(source_dir, "inference.py") + mx_mnist_model_data = os.path.join(base_dir, "model.tar.gz") + + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + model = MXNetModel( + entry_point=entry_point, + source_dir=source_dir, + role=role, + model_data=mx_mnist_model_data, + framework_version="1.7.0", + py_version="py3", + sagemaker_session=sagemaker_session, + ) + + step_register = RegisterModel( + name="mxnet-register-model", + model=model, + content_types=["*"], + response_types=["*"], + inference_instances=["ml.m5.xlarge"], + transform_instances=["*"], + description="test-description", + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type], + steps=[step_register], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + 
rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + execution = pipeline.start() + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_sklearn_xgboost_sip_model_registration( + sagemaker_session, role, pipeline_name, region_name +): + prefix = "sip" + bucket_name = sagemaker_session.default_bucket() + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + sklearn_processor = SKLearnProcessor( + role=role, + instance_type=instance_type, + instance_count=instance_count, + framework_version="0.20.0", + sagemaker_session=sagemaker_session, + ) + + # The path to the raw data. + raw_data_path = "s3://{0}/{1}/data/raw/".format(bucket_name, prefix) + raw_data_path_param = ParameterString(name="raw_data_path", default_value=raw_data_path) + + # The output path to the training data. + train_data_path = "s3://{0}/{1}/data/preprocessed/train/".format(bucket_name, prefix) + train_data_path_param = ParameterString(name="train_data_path", default_value=train_data_path) + + # The output path to the validation data. + val_data_path = "s3://{0}/{1}/data/preprocessed/val/".format(bucket_name, prefix) + val_data_path_param = ParameterString(name="val_data_path", default_value=val_data_path) + + # The training output path for the model. + output_path = "s3://{0}/{1}/output/".format(bucket_name, prefix) + output_path_param = ParameterString(name="output_path", default_value=output_path) + + # The output path to the featurizer model. 
+ model_path = "s3://{0}/{1}/output/sklearn/".format(bucket_name, prefix) + model_path_param = ParameterString(name="model_path", default_value=model_path) + + inputs = [ + ProcessingInput( + input_name="raw_data", + source=raw_data_path_param, + destination="/opt/ml/processing/input", + ) + ] + + outputs = [ + ProcessingOutput( + output_name="train_data", + source="/opt/ml/processing/train", + destination=train_data_path_param, + ), + ProcessingOutput( + output_name="val_data", + source="/opt/ml/processing/val", + destination=val_data_path_param, + ), + ProcessingOutput( + output_name="model", + source="/opt/ml/processing/model", + destination=model_path_param, + ), + ] + + base_dir = os.path.join(DATA_DIR, "sip") + code_path = os.path.join(base_dir, "preprocessor.py") + + processing_step = ProcessingStep( + name="Processing", + code=code_path, + processor=sklearn_processor, + inputs=inputs, + outputs=outputs, + job_arguments=["--train-test-split-ratio", "0.2"], + ) + + entry_point = "training.py" + source_dir = base_dir + code_location = "s3://{0}/{1}/code".format(bucket_name, prefix) + + estimator = XGBoost( + entry_point=entry_point, + source_dir=source_dir, + output_path=output_path_param, + code_location=code_location, + instance_type=instance_type, + instance_count=instance_count, + framework_version="0.90-2", + sagemaker_session=sagemaker_session, + py_version="py3", + role=role, + ) + + training_step = TrainingStep( + name="Training", + estimator=estimator, + inputs={ + "train": TrainingInput( + s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[ + "train_data" + ].S3Output.S3Uri, + content_type="text/csv", + ), + "validation": TrainingInput( + s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[ + "val_data" + ].S3Output.S3Uri, + content_type="text/csv", + ), + }, + ) + + code_location = "s3://{0}/{1}/code".format(bucket_name, prefix) + source_dir = os.path.join(base_dir, "sklearn_source_dir") + + sklearn_model = SKLearnModel( + name="sklearn-model", + model_data=processing_step.properties.ProcessingOutputConfig.Outputs[ + "model" + ].S3Output.S3Uri, + entry_point="inference.py", + source_dir=source_dir, + code_location=code_location, + role=role, + sagemaker_session=sagemaker_session, + framework_version="0.20.0", + py_version="py3", + ) + + code_location = "s3://{0}/{1}/code".format(bucket_name, prefix) + source_dir = os.path.join(base_dir, "xgboost_source_dir") + + xgboost_model = XGBoostModel( + name="xgboost-model", + model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts, + entry_point="inference.py", + source_dir=source_dir, + code_location=code_location, + framework_version="0.90-2", + py_version="py3", + role=role, + sagemaker_session=sagemaker_session, + ) + + pipeline_model = PipelineModel( + [xgboost_model, sklearn_model], role, sagemaker_session=sagemaker_session + ) + + step_register = RegisterModel( + name="AbaloneRegisterModel", + model=pipeline_model, + content_types=["application/json"], + response_types=["application/json"], + inference_instances=["ml.t2.medium", "ml.m5.xlarge"], + transform_instances=["ml.m5.xlarge"], + model_package_group_name="windturbine", + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[ + raw_data_path_param, + train_data_path_param, + val_data_path_param, + model_path_param, + instance_type, + instance_count, + output_path_param, + ], + steps=[processing_step, training_step, step_register], + sagemaker_session=sagemaker_session, + ) + + try: + response = 
pipeline.upsert(role_arn=role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + execution = pipeline.start() + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +@pytest.mark.skipif( + tests.integ.test_region() not in tests.integ.DRIFT_CHECK_BASELINES_SUPPORTED_REGIONS, + reason=( + "DriftCheckBaselines changes are not fully deployed in" f" {tests.integ.test_region()}." + ), +) +def test_model_registration_with_drift_check_baselines( + sagemaker_session, + role, + pipeline_name, +): + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + # upload model data to s3 + model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz") + model_base_uri = "s3://{}/{}/input/model/{}".format( + sagemaker_session.default_bucket(), + "register_model_test_with_drift_baseline", + utils.unique_name_from_base("model"), + ) + model_uri = S3Uploader.upload( + model_local_path, model_base_uri, sagemaker_session=sagemaker_session + ) + model_uri_param = ParameterString(name="model_uri", default_value=model_uri) + + # upload metrics to s3 + metrics_data = ( + '{"regression_metrics": {"mse": {"value": 4.925353410353891, ' + '"standard_deviation": 2.219186917819692}}}' + ) + metrics_base_uri = "s3://{}/{}/input/metrics/{}".format( + sagemaker_session.default_bucket(), + "register_model_test_with_drift_baseline", + utils.unique_name_from_base("metrics"), + ) + metrics_uri = S3Uploader.upload_string_as_file_body( + body=metrics_data, + desired_s3_uri=metrics_base_uri, + sagemaker_session=sagemaker_session, + ) + metrics_uri_param = ParameterString(name="metrics_uri", default_value=metrics_uri) + + model_metrics = ModelMetrics( + bias=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + explainability=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + bias_pre_training=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + bias_post_training=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + ) + drift_check_baselines = DriftCheckBaselines( + model_statistics=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + model_constraints=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + model_data_statistics=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + model_data_constraints=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + bias_config_file=FileSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + bias_pre_training_constraints=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + bias_post_training_constraints=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + explainability_constraints=MetricsSource( + s3_uri=metrics_uri_param, + content_type="application/json", + ), + explainability_config_file=FileSource( + s3_uri=metrics_uri_param, + 
content_type="application/json", + ), + ) + customer_metadata_properties = {"key1": "value1"} + estimator = XGBoost( + entry_point="training.py", + source_dir=os.path.join(DATA_DIR, "sip"), + instance_type=instance_type, + instance_count=instance_count, + framework_version="0.90-2", + sagemaker_session=sagemaker_session, + py_version="py3", + role=role, + ) + step_register = RegisterModel( + name="MyRegisterModelStep", + estimator=estimator, + model_data=model_uri_param, + content_types=["application/json"], + response_types=["application/json"], + inference_instances=["ml.t2.medium", "ml.m5.xlarge"], + transform_instances=["ml.m5.xlarge"], + model_package_group_name="testModelPackageGroup", + model_metrics=model_metrics, + drift_check_baselines=drift_check_baselines, + customer_metadata_properties=customer_metadata_properties, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[ + model_uri_param, + metrics_uri_param, + instance_type, + instance_count, + ], + steps=[step_register], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + + for _ in retries( + max_retry_count=5, + exception_message_prefix="Waiting for a successful execution of pipeline", + seconds_to_sleep=10, + ): + execution = pipeline.start( + parameters={"model_uri": model_uri, "metrics_uri": metrics_uri} + ) + response = execution.describe() + + assert response["PipelineArn"] == create_arn + + try: + execution.wait(delay=30, max_attempts=60) + except WaiterError: + pass + execution_steps = execution.list_steps() + + assert len(execution_steps) == 1 + failure_reason = execution_steps[0].get("FailureReason", "") + if failure_reason != "": + logging.error( + f"Pipeline execution failed with error: {failure_reason}." " Retrying.." 
+ ) + continue + assert execution_steps[0]["StepStatus"] == "Succeeded" + assert execution_steps[0]["StepName"] == "MyRegisterModelStep" + + response = sagemaker_session.sagemaker_client.describe_model_package( + ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"] + ) + + assert ( + response["ModelMetrics"]["Explainability"]["Report"]["ContentType"] + == "application/json" + ) + assert ( + response["DriftCheckBaselines"]["Bias"]["PreTrainingConstraints"]["ContentType"] + == "application/json" + ) + assert ( + response["DriftCheckBaselines"]["Explainability"]["Constraints"]["ContentType"] + == "application/json" + ) + assert ( + response["DriftCheckBaselines"]["ModelQuality"]["Statistics"]["ContentType"] + == "application/json" + ) + assert ( + response["DriftCheckBaselines"]["ModelDataQuality"]["Statistics"]["ContentType"] + == "application/json" + ) + assert response["CustomerMetadataProperties"] == customer_metadata_properties + break + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_model_registration_with_model_repack( + sagemaker_session, + role, + pipeline_name, + region_name, +): + kms_key = get_or_create_kms_key(sagemaker_session, role) + base_dir = os.path.join(DATA_DIR, "pytorch_mnist") + entry_point = os.path.join(base_dir, "mnist.py") + input_path = sagemaker_session.upload_data( + path=os.path.join(base_dir, "training"), + key_prefix="integ-test-data/pytorch_mnist/training", + ) + inputs = TrainingInput(s3_data=input_path) + + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1) + + pytorch_estimator = PyTorch( + entry_point=entry_point, + role=role, + framework_version="1.5.0", + py_version="py3", + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + output_kms_key=kms_key, + ) + step_train = TrainingStep( + name="pytorch-train", + estimator=pytorch_estimator, + inputs=inputs, + ) + + step_register = RegisterModel( + name="pytorch-register-model", + estimator=pytorch_estimator, + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + content_types=["text/csv"], + response_types=["text/csv"], + inference_instances=["ml.t2.medium", "ml.m5.large"], + transform_instances=["ml.m5.large"], + description="test-description", + entry_point=entry_point, + model_kms_key=kms_key, + ) + + model = Model( + image_uri=pytorch_estimator.training_image_uri(), + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + sagemaker_session=sagemaker_session, + role=role, + ) + model_inputs = CreateModelInput( + instance_type="ml.m5.large", + accelerator_type="ml.eia1.medium", + ) + step_model = CreateModelStep( + name="pytorch-model", + model=model, + inputs=model_inputs, + ) + + step_cond = ConditionStep( + name="cond-good-enough", + conditions=[ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)], + if_steps=[step_train, step_register], + else_steps=[step_model], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[good_enough_input, instance_count, instance_type], + steps=[step_cond], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert 
re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + execution = pipeline.start(parameters={"GoodEnoughInput": 0}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/sagemaker/workflow/test_processing_steps.py b/tests/integ/sagemaker/workflow/test_processing_steps.py new file mode 100644 index 0000000000..781bce85a7 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_processing_steps.py @@ -0,0 +1,798 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import os +import re +import subprocess +from datetime import datetime + +import pytest +from botocore.exceptions import WaiterError + +from sagemaker import image_uris, get_execution_role, utils +from sagemaker.dataset_definition import DatasetDefinition, AthenaDatasetDefinition +from sagemaker.processing import ProcessingInput, ProcessingOutput +from sagemaker.s3 import S3Uploader +from sagemaker.sklearn import SKLearnProcessor +from sagemaker.workflow.parameters import ParameterInteger, ParameterString +from sagemaker.workflow.pipeline import Pipeline +from sagemaker.workflow.steps import ( + ProcessingStep, + CacheConfig, +) +from sagemaker.spark.processing import PySparkProcessor, SparkJarProcessor +from sagemaker.wrangler.processing import DataWranglerProcessor +from tests.integ import DATA_DIR + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-processing") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +@pytest.fixture +def configuration() -> list: + configuration = [ + { + "Classification": "spark-defaults", + "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"}, + }, + { + "Classification": "hadoop-env", + "Properties": {}, + "Configurations": [ + { + "Classification": "export", + "Properties": { + "HADOOP_DATANODE_HEAPSIZE": "2048", + "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", + }, + "Configurations": [], + } + ], + }, + { + "Classification": "core-site", + "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"}, + }, + {"Classification": "hadoop-log4j", "Properties": {"key": "value"}}, + { + "Classification": "hive-env", + "Properties": {}, + "Configurations": [ + { + "Classification": "export", + "Properties": { + "HADOOP_DATANODE_HEAPSIZE": "2048", + "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", + }, + "Configurations": [], + } + ], + }, + {"Classification": "hive-log4j", "Properties": {"key": "value"}}, + {"Classification": "hive-exec-log4j", "Properties": {"key": "value"}}, + {"Classification": "hive-site", "Properties": {"key": "value"}}, + {"Classification": "spark-defaults", 
"Properties": {"key": "value"}}, + { + "Classification": "spark-env", + "Properties": {}, + "Configurations": [ + { + "Classification": "export", + "Properties": { + "HADOOP_DATANODE_HEAPSIZE": "2048", + "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", + }, + "Configurations": [], + } + ], + }, + {"Classification": "spark-log4j", "Properties": {"key": "value"}}, + {"Classification": "spark-hive-site", "Properties": {"key": "value"}}, + {"Classification": "spark-metrics", "Properties": {"key": "value"}}, + {"Classification": "yarn-site", "Properties": {"key": "value"}}, + { + "Classification": "yarn-env", + "Properties": {}, + "Configurations": [ + { + "Classification": "export", + "Properties": { + "HADOOP_DATANODE_HEAPSIZE": "2048", + "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", + }, + "Configurations": [], + } + ], + }, + ] + return configuration + + +@pytest.fixture(scope="module") +def build_jar(): + spark_path = os.path.join(DATA_DIR, "spark") + java_file_path = os.path.join("com", "amazonaws", "..", "spark", "test") + java_version_pattern = r"(\d+\.\d+).*" + jar_file_path = os.path.join(spark_path, "code", "java", "hello-java-spark") + # compile java file + java_version = subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT).decode( + "utf-8" + ) + java_version = re.search(java_version_pattern, java_version).groups()[0] + + if float(java_version) > 1.8: + subprocess.run( + [ + "javac", + "--release", + "8", + os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.java"), + ] + ) + else: + subprocess.run( + [ + "javac", + os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.java"), + ] + ) + + subprocess.run( + [ + "jar", + "cfm", + os.path.join(jar_file_path, "hello-spark-java.jar"), + os.path.join(jar_file_path, "manifest.txt"), + "-C", + jar_file_path, + ".", + ] + ) + yield + subprocess.run(["rm", os.path.join(jar_file_path, "hello-spark-java.jar")]) + subprocess.run(["rm", os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.class")]) + + +@pytest.fixture +def athena_dataset_definition(sagemaker_session): + return DatasetDefinition( + local_path="/opt/ml/processing/input/add", + data_distribution_type="FullyReplicated", + input_mode="File", + athena_dataset_definition=AthenaDatasetDefinition( + catalog="AwsDataCatalog", + database="default", + work_group="workgroup", + query_string=('SELECT * FROM "default"."s3_test_table_$STAGE_$REGIONUNDERSCORED";'), + output_s3_uri=f"s3://{sagemaker_session.default_bucket()}/add", + output_format="JSON", + output_compression="GZIP", + ), + ) + + +def test_one_step_sklearn_processing_pipeline( + sagemaker_session, + role, + sklearn_latest_version, + cpu_instance_type, + pipeline_name, + region_name, + athena_dataset_definition, +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + script_path = os.path.join(DATA_DIR, "dummy_script.py") + input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") + inputs = [ + ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"), + ProcessingInput(dataset_definition=athena_dataset_definition), + ] + + cache_config = CacheConfig(enable_caching=True, expire_after="T30m") + + sklearn_processor = SKLearnProcessor( + framework_version=sklearn_latest_version, + role=role, + instance_type=cpu_instance_type, + instance_count=instance_count, + command=["python3"], + sagemaker_session=sagemaker_session, + base_job_name="test-sklearn", + ) + + step_sklearn = ProcessingStep( + name="sklearn-process", + 
processor=sklearn_processor, + inputs=inputs, + code=script_path, + cache_config=cache_config, + ) + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_sklearn], + sagemaker_session=sagemaker_session, + ) + + try: + # NOTE: We should exercise the case when role used in the pipeline execution is + # different than that required of the steps in the pipeline itself. The role in + # the pipeline definition needs to create training and processing jobs and other + # sagemaker entities. However, the jobs created in the steps themselves execute + # under a potentially different role, often requiring access to S3 and other + # artifacts not required to during creation of the jobs in the pipeline steps. + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + response = execution.describe() + assert response["PipelineArn"] == create_arn + + # Check CacheConfig + response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] + assert response["Enabled"] == cache_config.enable_caching + assert response["ExpireAfter"] == cache_config.expire_after + + try: + execution.wait(delay=30, max_attempts=3) + except WaiterError: + pass + execution_steps = execution.list_steps() + assert len(execution_steps) == 1 + assert execution_steps[0]["StepName"] == "sklearn-process" + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_one_step_framework_processing_pipeline( + sagemaker_session, + role, + sklearn_latest_version, + cpu_instance_type, + pipeline_name, + region_name, + athena_dataset_definition, +): + """Use `SKLearnProcessor` to test `FrameworkProcessor`.""" + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + script_path = os.path.join(DATA_DIR, "dummy_script.py") + input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") + + inputs = [ + ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"), + ProcessingInput(dataset_definition=athena_dataset_definition), + ] + + cache_config = CacheConfig(enable_caching=True, expire_after="T30m") + + sklearn_processor = SKLearnProcessor( + framework_version=sklearn_latest_version, + role=role, + instance_type=cpu_instance_type, + instance_count=instance_count, + sagemaker_session=sagemaker_session, + base_job_name="test-sklearn", + ) + + run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs) + + step_sklearn = ProcessingStep( + name="sklearn-process", + processor=sklearn_processor, + inputs=run_args.inputs, + outputs=run_args.outputs, + job_arguments=run_args.arguments, + code=run_args.code, + cache_config=cache_config, + ) + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_sklearn], + sagemaker_session=sagemaker_session, + ) + + try: + # NOTE: We should exercise the case when role used in the pipeline execution is + # different than that required of the steps in the pipeline itself. The role in + # the pipeline definition needs to create training and processing jobs and other + # sagemaker entities. 
However, the jobs created in the steps themselves execute
+        # under a potentially different role, often requiring access to S3 and other
+        # artifacts not required during creation of the jobs in the pipeline steps.
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+
+        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
+        response = pipeline.update(role)
+        update_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            update_arn,
+        )
+
+        execution = pipeline.start(parameters={})
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            execution.arn,
+        )
+
+        response = execution.describe()
+        assert response["PipelineArn"] == create_arn
+
+        # Check CacheConfig
+        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
+        assert response["Enabled"] == cache_config.enable_caching
+        assert response["ExpireAfter"] == cache_config.expire_after
+
+        try:
+            execution.wait(delay=30, max_attempts=3)
+        except WaiterError:
+            pass
+        execution_steps = execution.list_steps()
+        assert len(execution_steps) == 1
+        assert execution_steps[0]["StepName"] == "sklearn-process"
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass
+
+
+def test_one_step_pyspark_processing_pipeline(
+    sagemaker_session,
+    role,
+    cpu_instance_type,
+    pipeline_name,
+    region_name,
+):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
+    script_path = os.path.join(DATA_DIR, "dummy_script.py")
+
+    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
+
+    pyspark_processor = PySparkProcessor(
+        base_job_name="sm-spark",
+        framework_version="2.4",
+        role=role,
+        instance_count=instance_count,
+        instance_type=cpu_instance_type,
+        max_runtime_in_seconds=1200,
+        sagemaker_session=sagemaker_session,
+    )
+
+    spark_run_args = pyspark_processor.get_run_args(
+        submit_app=script_path,
+        arguments=[
+            "--s3_input_bucket",
+            sagemaker_session.default_bucket(),
+            "--s3_input_key_prefix",
+            "spark-input",
+            "--s3_output_bucket",
+            sagemaker_session.default_bucket(),
+            "--s3_output_key_prefix",
+            "spark-output",
+        ],
+    )
+
+    step_pyspark = ProcessingStep(
+        name="pyspark-process",
+        processor=pyspark_processor,
+        inputs=spark_run_args.inputs,
+        outputs=spark_run_args.outputs,
+        job_arguments=spark_run_args.arguments,
+        code=spark_run_args.code,
+        cache_config=cache_config,
+    )
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_count],
+        steps=[step_pyspark],
+        sagemaker_session=sagemaker_session,
+    )
+
+    try:
+        # NOTE: We should exercise the case when the role used in the pipeline execution is
+        # different from that required of the steps in the pipeline itself. The role in
+        # the pipeline definition needs to create training and processing jobs and other
+        # sagemaker entities. However, the jobs created in the steps themselves execute
+        # under a potentially different role, often requiring access to S3 and other
+        # artifacts not required during creation of the jobs in the pipeline steps.
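+        # The block below walks the full lifecycle: create() returns the pipeline
+        # ARN, update() re-registers the definition after swapping the InstanceCount
+        # parameter, start() kicks off an execution, and describe() confirms the
+        # execution points back at the pipeline that was created.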
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+
+        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
+        response = pipeline.update(role)
+        update_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            update_arn,
+        )
+
+        execution = pipeline.start(parameters={})
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            execution.arn,
+        )
+
+        response = execution.describe()
+        assert response["PipelineArn"] == create_arn
+
+        # Check CacheConfig
+        response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"]
+        assert response["Enabled"] == cache_config.enable_caching
+        assert response["ExpireAfter"] == cache_config.expire_after
+
+        try:
+            execution.wait(delay=30, max_attempts=3)
+        except WaiterError:
+            pass
+        execution_steps = execution.list_steps()
+        assert len(execution_steps) == 1
+        assert execution_steps[0]["StepName"] == "pyspark-process"
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass
+
+
+def test_one_step_sparkjar_processing_pipeline(
+    sagemaker_session,
+    role,
+    cpu_instance_type,
+    pipeline_name,
+    region_name,
+    configuration,
+    build_jar,
+):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
+    cache_config = CacheConfig(enable_caching=True, expire_after="T30m")
+    spark_path = os.path.join(DATA_DIR, "spark")
+
+    spark_jar_processor = SparkJarProcessor(
+        role=role,
+        instance_count=2,
+        instance_type=cpu_instance_type,
+        sagemaker_session=sagemaker_session,
+        framework_version="2.4",
+    )
+    bucket = spark_jar_processor.sagemaker_session.default_bucket()
+    with open(os.path.join(spark_path, "files", "data.jsonl")) as data:
+        body = data.read()
+    input_data_uri = f"s3://{bucket}/spark/input/data.jsonl"
+    S3Uploader.upload_string_as_file_body(
+        body=body,
+        desired_s3_uri=input_data_uri,
+        sagemaker_session=sagemaker_session,
+    )
+    output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}"
+
+    java_project_dir = os.path.join(spark_path, "code", "java", "hello-java-spark")
+    spark_run_args = spark_jar_processor.get_run_args(
+        submit_app=f"{java_project_dir}/hello-spark-java.jar",
+        submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp",
+        arguments=["--input", input_data_uri, "--output", output_data_uri],
+        configuration=configuration,
+    )
+
+    step_pyspark = ProcessingStep(
+        name="sparkjar-process",
+        processor=spark_jar_processor,
+        inputs=spark_run_args.inputs,
+        outputs=spark_run_args.outputs,
+        job_arguments=spark_run_args.arguments,
+        code=spark_run_args.code,
+        cache_config=cache_config,
+    )
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_count],
+        steps=[step_pyspark],
+        sagemaker_session=sagemaker_session,
+    )
+
+    try:
+        # NOTE: We should exercise the case when the role used in the pipeline execution is
+        # different from that required of the steps in the pipeline itself. The role in
+        # the pipeline definition needs to create training and processing jobs and other
+        # sagemaker entities. However, the jobs created in the steps themselves execute
+        # under a potentially different role, often requiring access to S3 and other
+        # artifacts not required during creation of the jobs in the pipeline steps.
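+        # execution.wait() below is capped at 3 attempts and any WaiterError is
+        # swallowed on purpose: the assertions that follow only need the step to
+        # have been scheduled, not for the Spark job itself to finish.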
+ response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role) + update_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + response = execution.describe() + assert response["PipelineArn"] == create_arn + + # Check CacheConfig + response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] + assert response["Enabled"] == cache_config.enable_caching + assert response["ExpireAfter"] == cache_config.expire_after + + try: + execution.wait(delay=30, max_attempts=3) + except WaiterError: + pass + execution_steps = execution.list_steps() + assert len(execution_steps) == 1 + assert execution_steps[0]["StepName"] == "sparkjar-process" + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role, pipeline_name): + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge") + + recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow") + input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv") + + output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default" + output_content_type = "CSV" + output_config = {output_name: {"content_type": output_content_type}} + job_argument = [f"--output-config '{json.dumps(output_config)}'"] + + inputs = [ + ProcessingInput( + input_name="dummy_data.csv", + source=input_file_path, + destination="/opt/ml/processing/dummy_data.csv", + ) + ] + + output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output" + outputs = [ + ProcessingOutput( + output_name=output_name, + source="/opt/ml/processing/output", + destination=output_s3_uri, + s3_upload_mode="EndOfJob", + ) + ] + + data_wrangler_processor = DataWranglerProcessor( + role=role, + data_wrangler_flow_source=recipe_file_path, + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + max_runtime_in_seconds=86400, + ) + + data_wrangler_step = ProcessingStep( + name="data-wrangler-step", + processor=data_wrangler_processor, + inputs=inputs, + outputs=outputs, + job_arguments=job_argument, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type], + steps=[data_wrangler_step], + sagemaker_session=sagemaker_session, + ) + + definition = json.loads(pipeline.definition()) + expected_image_uri = image_uris.retrieve( + "data-wrangler", region=sagemaker_session.boto_region_name + ) + assert len(definition["Steps"]) == 1 + assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] is not None + assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] == expected_image_uri + + assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None + processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"] + assert len(processing_inputs) == 2 + for processing_input in processing_inputs: + if processing_input["InputName"] == "flow": + assert 
processing_input["S3Input"]["S3Uri"].endswith(".flow") + assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/flow" + elif processing_input["InputName"] == "dummy_data.csv": + assert processing_input["S3Input"]["S3Uri"].endswith(".csv") + assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/dummy_data.csv" + else: + raise AssertionError("Unknown input name") + assert definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"] is not None + processing_outputs = definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"]["Outputs"] + assert len(processing_outputs) == 1 + assert processing_outputs[0]["OutputName"] == output_name + assert processing_outputs[0]["S3Output"] is not None + assert processing_outputs[0]["S3Output"]["LocalPath"] == "/opt/ml/processing/output" + assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + + execution = pipeline.start() + response = execution.describe() + assert response["PipelineArn"] == create_arn + + try: + execution.wait(delay=60, max_attempts=10) + except WaiterError: + pass + + execution_steps = execution.list_steps() + assert len(execution_steps) == 1 + assert execution_steps[0]["StepName"] == "data-wrangler-step" + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_two_processing_job_depends_on( + sagemaker_session, + role, + pipeline_name, + region_name, + cpu_instance_type, +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + script_path = os.path.join(DATA_DIR, "dummy_script.py") + + pyspark_processor = PySparkProcessor( + base_job_name="sm-spark", + framework_version="2.4", + role=role, + instance_count=instance_count, + instance_type=cpu_instance_type, + max_runtime_in_seconds=1200, + sagemaker_session=sagemaker_session, + ) + + spark_run_args = pyspark_processor.get_run_args( + submit_app=script_path, + arguments=[ + "--s3_input_bucket", + sagemaker_session.default_bucket(), + "--s3_input_key_prefix", + "spark-input", + "--s3_output_bucket", + sagemaker_session.default_bucket(), + "--s3_output_key_prefix", + "spark-output", + ], + ) + + step_pyspark_1 = ProcessingStep( + name="pyspark-process-1", + processor=pyspark_processor, + inputs=spark_run_args.inputs, + outputs=spark_run_args.outputs, + job_arguments=spark_run_args.arguments, + code=spark_run_args.code, + ) + + step_pyspark_2 = ProcessingStep( + name="pyspark-process-2", + depends_on=[step_pyspark_1], + processor=pyspark_processor, + inputs=spark_run_args.inputs, + outputs=spark_run_args.outputs, + job_arguments=spark_run_args.arguments, + code=spark_run_args.code, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=[step_pyspark_1, step_pyspark_2], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role) + update_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + + response = execution.describe() + assert 
response["PipelineArn"] == create_arn + + try: + execution.wait(delay=60) + except WaiterError: + pass + + execution_steps = execution.list_steps() + assert len(execution_steps) == 2 + time_stamp = {} + for execution_step in execution_steps: + name = execution_step["StepName"] + if name == "pyspark-process-1": + time_stamp[name] = execution_step["EndTime"] + else: + time_stamp[name] = execution_step["StartTime"] + assert time_stamp["pyspark-process-1"] < time_stamp["pyspark-process-2"] + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/test_workflow_with_quality_check_steps.py b/tests/integ/sagemaker/workflow/test_quality_check_steps.py similarity index 95% rename from tests/integ/test_workflow_with_quality_check_steps.py rename to tests/integ/sagemaker/workflow/test_quality_check_steps.py index 8214f1b5f2..043989008e 100644 --- a/tests/integ/test_workflow_with_quality_check_steps.py +++ b/tests/integ/sagemaker/workflow/test_quality_check_steps.py @@ -18,7 +18,6 @@ import pytest from botocore.exceptions import WaiterError -import tests from sagemaker.workflow.parameters import ParameterString from tests.integ import DATA_DIR @@ -122,10 +121,6 @@ def model_quality_supplied_baseline_statistics(sagemaker_session): ).file_s3_uri -@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"QualityCheckStep is not fully deployed in {tests.integ.test_region()}", -) def test_one_step_data_quality_pipeline_happycase( sagemaker_session, role, @@ -220,10 +215,6 @@ def test_one_step_data_quality_pipeline_happycase( pass -@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"QualityCheckStep is not fully deployed in {tests.integ.test_region()}", -) def test_one_step_data_quality_pipeline_constraint_violation( sagemaker_session, role, @@ -299,10 +290,6 @@ def test_one_step_data_quality_pipeline_constraint_violation( pass -@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"QualityCheckStep is not fully deployed in {tests.integ.test_region()}", -) def test_one_step_model_quality_pipeline_happycase( sagemaker_session, role, @@ -398,10 +385,6 @@ def test_one_step_model_quality_pipeline_happycase( pass -@pytest.mark.skipif( - tests.integ.test_region() in tests.integ.NO_SM_PIPELINE_MM_CLARIFY_CHECK_STEP_REGIONS, - reason=f"QualityCheckStep is not fully deployed in {tests.integ.test_region()}", -) def test_one_step_model_quality_pipeline_constraint_violation( sagemaker_session, role, diff --git a/tests/integ/test_workflow_retry.py b/tests/integ/sagemaker/workflow/test_retry.py similarity index 100% rename from tests/integ/test_workflow_retry.py rename to tests/integ/sagemaker/workflow/test_retry.py diff --git a/tests/integ/sagemaker/workflow/test_training_steps.py b/tests/integ/sagemaker/workflow/test_training_steps.py new file mode 100644 index 0000000000..0f1ba84a55 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_training_steps.py @@ -0,0 +1,153 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. 
This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import uuid +import logging + +import pytest +from botocore.exceptions import WaiterError + +from sagemaker import TrainingInput, get_execution_role, utils +from sagemaker.debugger import ( + DebuggerHookConfig, + Rule, + rule_configs, +) +from sagemaker.pytorch.estimator import PyTorch +from sagemaker.workflow.parameters import ParameterInteger, ParameterString +from sagemaker.workflow.pipeline import Pipeline +from sagemaker.workflow.steps import TrainingStep +from tests.integ.retry import retries +from tests.integ import DATA_DIR + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-training") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +def test_training_job_with_debugger_and_profiler( + sagemaker_session, + pipeline_name, + role, + pytorch_training_latest_version, + pytorch_training_latest_py_version, +): + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + rules = [ + Rule.sagemaker(rule_configs.vanishing_gradient()), + Rule.sagemaker(base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}), + Rule.sagemaker(rule_configs.loss_not_decreasing()), + ] + debugger_hook_config = DebuggerHookConfig( + s3_output_path=(f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors") + ) + + base_dir = os.path.join(DATA_DIR, "pytorch_mnist") + script_path = os.path.join(base_dir, "mnist.py") + input_path = sagemaker_session.upload_data( + path=os.path.join(base_dir, "training"), + key_prefix="integ-test-data/pytorch_mnist/training", + ) + inputs = TrainingInput(s3_data=input_path) + + pytorch_estimator = PyTorch( + entry_point=script_path, + role="SageMakerRole", + framework_version=pytorch_training_latest_version, + py_version=pytorch_training_latest_py_version, + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + rules=rules, + debugger_hook_config=debugger_hook_config, + ) + + step_train = TrainingStep( + name="pytorch-train", + estimator=pytorch_estimator, + inputs=inputs, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type], + steps=[step_train], + sagemaker_session=sagemaker_session, + ) + + for _ in retries( + max_retry_count=5, + exception_message_prefix="Waiting for a successful execution of pipeline", + seconds_to_sleep=10, + ): + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + + execution = pipeline.start() + response = execution.describe() + assert response["PipelineArn"] == create_arn + + try: + execution.wait(delay=10, max_attempts=60) + except WaiterError: + pass + execution_steps = execution.list_steps() + + assert len(execution_steps) == 1 + failure_reason = execution_steps[0].get("FailureReason", "") + if failure_reason != "": + logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..") + continue + assert execution_steps[0]["StepName"] == "pytorch-train" + assert execution_steps[0]["StepStatus"] == "Succeeded" + + 
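+            # The step's TrainingJob metadata holds the job ARN; describe_training_job
+            # needs the job name, which is the segment after the "/" in that ARN.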
training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"] + job_description = sagemaker_session.sagemaker_client.describe_training_job( + TrainingJobName=training_job_arn.split("/")[1] + ) + + for index, rule in enumerate(rules): + config = job_description["DebugRuleConfigurations"][index] + assert config["RuleConfigurationName"] == rule.name + assert config["RuleEvaluatorImage"] == rule.image_uri + assert config["VolumeSizeInGB"] == 0 + assert ( + config["RuleParameters"]["rule_to_invoke"] + == rule.rule_parameters["rule_to_invoke"] + ) + assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict() + + assert job_description["ProfilingStatus"] == "Enabled" + assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500 + break + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/sagemaker/workflow/test_tuning_steps.py b/tests/integ/sagemaker/workflow/test_tuning_steps.py new file mode 100644 index 0000000000..7cfb542cb6 --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_tuning_steps.py @@ -0,0 +1,317 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import re + +import pytest + +from sagemaker import TrainingInput, Model, get_execution_role, utils +from sagemaker.dataset_definition import DatasetDefinition, AthenaDatasetDefinition +from sagemaker.inputs import CreateModelInput +from sagemaker.processing import ProcessingInput, ProcessingOutput +from sagemaker.pytorch import PyTorch +from sagemaker.sklearn import SKLearnProcessor +from sagemaker.tuner import HyperparameterTuner, IntegerParameter +from sagemaker.workflow.functions import JsonGet +from sagemaker.workflow.parameters import ParameterInteger, ParameterString +from sagemaker.workflow.pipeline import Pipeline + +from sagemaker.workflow.steps import ( + CreateModelStep, + ProcessingStep, + TuningStep, + PropertyFile, +) +from tests.integ import DATA_DIR + + +@pytest.fixture +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture +def pipeline_name(): + return utils.unique_name_from_base("my-pipeline-training") + + +@pytest.fixture +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +@pytest.fixture +def script_dir(): + return os.path.join(DATA_DIR, "sklearn_processing") + + +@pytest.fixture +def athena_dataset_definition(sagemaker_session): + return DatasetDefinition( + local_path="/opt/ml/processing/input/add", + data_distribution_type="FullyReplicated", + input_mode="File", + athena_dataset_definition=AthenaDatasetDefinition( + catalog="AwsDataCatalog", + database="default", + work_group="workgroup", + query_string=('SELECT * FROM "default"."s3_test_table_$STAGE_$REGIONUNDERSCORED";'), + output_s3_uri=f"s3://{sagemaker_session.default_bucket()}/add", + output_format="JSON", + output_compression="GZIP", + ), + ) + + +def test_tuning_single_algo( + sagemaker_session, + role, + 
cpu_instance_type, + pipeline_name, + region_name, +): + base_dir = os.path.join(DATA_DIR, "pytorch_mnist") + entry_point = os.path.join(base_dir, "mnist.py") + input_path = sagemaker_session.upload_data( + path=os.path.join(base_dir, "training"), + key_prefix="integ-test-data/pytorch_mnist/training", + ) + inputs = TrainingInput(s3_data=input_path) + + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + pytorch_estimator = PyTorch( + entry_point=entry_point, + role=role, + framework_version="1.5.0", + py_version="py3", + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + enable_sagemaker_metrics=True, + max_retry_attempts=3, + ) + + min_batch_size = ParameterString(name="MinBatchSize", default_value="64") + max_batch_size = ParameterString(name="MaxBatchSize", default_value="128") + hyperparameter_ranges = { + "batch-size": IntegerParameter(min_batch_size, max_batch_size), + } + + tuner = HyperparameterTuner( + estimator=pytorch_estimator, + objective_metric_name="test:acc", + objective_type="Maximize", + hyperparameter_ranges=hyperparameter_ranges, + metric_definitions=[{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}], + max_jobs=2, + max_parallel_jobs=2, + ) + + step_tune = TuningStep( + name="my-tuning-step", + tuner=tuner, + inputs=inputs, + ) + + best_model = Model( + image_uri=pytorch_estimator.training_image_uri(), + model_data=step_tune.get_top_model_s3_uri( + top_k=0, + s3_bucket=sagemaker_session.default_bucket(), + ), + sagemaker_session=sagemaker_session, + role=role, + ) + model_inputs = CreateModelInput( + instance_type="ml.m5.large", + accelerator_type="ml.eia1.medium", + ) + step_best_model = CreateModelStep( + name="1st-model", + model=best_model, + inputs=model_inputs, + ) + + second_best_model = Model( + image_uri=pytorch_estimator.training_image_uri(), + model_data=step_tune.get_top_model_s3_uri( + top_k=1, + s3_bucket=sagemaker_session.default_bucket(), + ), + sagemaker_session=sagemaker_session, + role=role, + ) + + step_second_best_model = CreateModelStep( + name="2nd-best-model", + model=second_best_model, + inputs=model_inputs, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type, min_batch_size, max_batch_size], + steps=[step_tune, step_best_model, step_second_best_model], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_tuning_multi_algos( + sagemaker_session, + role, + cpu_instance_type, + pipeline_name, + region_name, + script_dir, + athena_dataset_definition, +): + base_dir = os.path.join(DATA_DIR, "pytorch_mnist") + entry_point = os.path.join(base_dir, "mnist.py") + input_path = sagemaker_session.upload_data( + path=os.path.join(base_dir, "training"), + key_prefix="integ-test-data/pytorch_mnist/training", + ) + + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + + input_data = 
f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" + + sklearn_processor = SKLearnProcessor( + framework_version="0.20.0", + instance_type=instance_type, + instance_count=instance_count, + base_job_name="test-sklearn", + sagemaker_session=sagemaker_session, + role=role, + ) + + property_file = PropertyFile( + name="DataAttributes", output_name="attributes", path="attributes.json" + ) + + step_process = ProcessingStep( + name="my-process", + display_name="ProcessingStep", + description="description for Processing step", + processor=sklearn_processor, + inputs=[ + ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), + ProcessingInput(dataset_definition=athena_dataset_definition), + ], + outputs=[ + ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), + ProcessingOutput(output_name="attributes", source="/opt/ml/processing/attributes.json"), + ], + property_files=[property_file], + code=os.path.join(script_dir, "preprocessing.py"), + ) + + static_hp_1 = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + json_get_hp = JsonGet( + step_name=step_process.name, property_file=property_file, json_path="train_size" + ) + pytorch_estimator = PyTorch( + entry_point=entry_point, + role=role, + framework_version="1.5.0", + py_version="py3", + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + enable_sagemaker_metrics=True, + max_retry_attempts=3, + hyperparameters={"static-hp": static_hp_1, "train_size": json_get_hp}, + ) + + min_batch_size = ParameterString(name="MinBatchSize", default_value="64") + max_batch_size = json_get_hp + + tuner = HyperparameterTuner.create( + estimator_dict={ + "estimator-1": pytorch_estimator, + "estimator-2": pytorch_estimator, + }, + objective_metric_name_dict={ + "estimator-1": "test:acc", + "estimator-2": "test:acc", + }, + hyperparameter_ranges_dict={ + "estimator-1": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)}, + "estimator-2": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)}, + }, + metric_definitions_dict={ + "estimator-1": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}], + "estimator-2": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}], + }, + ) + + inputs = { + "estimator-1": TrainingInput(s3_data=input_path), + "estimator-2": TrainingInput(s3_data=input_path), + } + + step_tune = TuningStep( + name="my-tuning-step", + tuner=tuner, + inputs=inputs, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type, min_batch_size, max_batch_size], + steps=[step_process, step_tune], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + execution = pipeline.start(parameters={}) + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", + execution.arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/sagemaker/workflow/test_workflow.py b/tests/integ/sagemaker/workflow/test_workflow.py new file mode 100644 index 0000000000..e0c2c3219d --- /dev/null +++ b/tests/integ/sagemaker/workflow/test_workflow.py @@ -0,0 +1,1010 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). 
You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import os +import re +import time + +from contextlib import contextmanager +import pytest + +from botocore.exceptions import WaiterError +import pandas as pd + +from tests.integ.timeout import timeout + +from sagemaker.session import Session +from sagemaker import image_uris +from sagemaker.estimator import Estimator +from sagemaker.inputs import CreateModelInput, TrainingInput +from sagemaker.model import Model +from sagemaker.model_metrics import MetricsSource, ModelMetrics +from sagemaker.processing import ( + ProcessingInput, + ProcessingOutput, + FeatureStoreOutput, + ScriptProcessor, +) +from sagemaker.s3 import S3Uploader +from sagemaker.session import get_execution_role +from sagemaker.sklearn.estimator import SKLearn +from sagemaker.transformer import Transformer +from sagemaker.sklearn.processing import SKLearnProcessor +from sagemaker.workflow.conditions import ( + ConditionGreaterThanOrEqualTo, + ConditionLessThanOrEqualTo, +) +from sagemaker.workflow.condition_step import ConditionStep +from sagemaker.workflow.callback_step import ( + CallbackStep, + CallbackOutput, + CallbackOutputTypeEnum, +) +from sagemaker.wrangler.processing import DataWranglerProcessor +from sagemaker.dataset_definition.inputs import ( + DatasetDefinition, + AthenaDatasetDefinition, +) +from sagemaker.workflow.execution_variables import ExecutionVariables +from sagemaker.workflow.functions import Join, JsonGet +from sagemaker.wrangler.ingestion import generate_data_ingestion_flow_from_s3_input +from sagemaker.workflow.parameters import ( + ParameterInteger, + ParameterString, +) +from sagemaker.workflow.steps import ( + CreateModelStep, + ProcessingStep, + TrainingStep, + TransformStep, + TransformInput, + PropertyFile, +) +from sagemaker.workflow.step_collections import RegisterModel +from sagemaker.workflow.pipeline import Pipeline +from sagemaker.feature_store.feature_group import ( + FeatureGroup, + FeatureDefinition, + FeatureTypeEnum, +) +from tests.integ import DATA_DIR + + +def ordered(obj): + """Helper function for dict comparison""" + if isinstance(obj, dict): + return sorted((k, ordered(v)) for k, v in obj.items()) + if isinstance(obj, list): + return sorted(ordered(x) for x in obj) + else: + return obj + + +@pytest.fixture(scope="module") +def region_name(sagemaker_session): + return sagemaker_session.boto_session.region_name + + +@pytest.fixture(scope="module") +def role(sagemaker_session): + return get_execution_role(sagemaker_session) + + +@pytest.fixture(scope="module") +def script_dir(): + return os.path.join(DATA_DIR, "sklearn_processing") + + +@pytest.fixture(scope="module") +def feature_store_session(sagemaker_session): + boto_session = sagemaker_session.boto_session + sagemaker_client = boto_session.client("sagemaker") + featurestore_runtime_client = boto_session.client("sagemaker-featurestore-runtime") + + return Session( + boto_session=boto_session, + sagemaker_client=sagemaker_client, + sagemaker_featurestore_runtime_client=featurestore_runtime_client, + ) + + +@pytest.fixture +def pipeline_name(): + 
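+    # The timestamp suffix keeps pipeline names unique across concurrent test runs.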
return f"my-pipeline-{int(time.time() * 10 ** 7)}" + + +@pytest.fixture(scope="module") +def athena_dataset_definition(sagemaker_session): + return DatasetDefinition( + local_path="/opt/ml/processing/input/add", + data_distribution_type="FullyReplicated", + input_mode="File", + athena_dataset_definition=AthenaDatasetDefinition( + catalog="AwsDataCatalog", + database="default", + work_group="workgroup", + query_string=('SELECT * FROM "default"."s3_test_table_$STAGE_$REGIONUNDERSCORED";'), + output_s3_uri=f"s3://{sagemaker_session.default_bucket()}/add", + output_format="JSON", + output_compression="GZIP", + ), + ) + + +def test_three_step_definition( + sagemaker_session, + region_name, + role, + script_dir, + pipeline_name, + athena_dataset_definition, +): + framework_version = "0.20.0" + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + output_prefix = ParameterString(name="OutputPrefix", default_value="output") + + input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" + + sklearn_processor = SKLearnProcessor( + framework_version=framework_version, + instance_type=instance_type, + instance_count=instance_count, + base_job_name="test-sklearn", + sagemaker_session=sagemaker_session, + role=role, + ) + step_process = ProcessingStep( + name="my-process", + display_name="ProcessingStep", + description="description for Processing step", + processor=sklearn_processor, + inputs=[ + ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), + ProcessingInput(dataset_definition=athena_dataset_definition), + ], + outputs=[ + ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), + ProcessingOutput( + output_name="test_data", + source="/opt/ml/processing/test", + destination=Join( + on="/", + values=[ + "s3:/", + sagemaker_session.default_bucket(), + "test-sklearn", + output_prefix, + ExecutionVariables.PIPELINE_EXECUTION_ID, + ], + ), + ), + ], + code=os.path.join(script_dir, "preprocessing.py"), + ) + + sklearn_train = SKLearn( + framework_version=framework_version, + entry_point=os.path.join(script_dir, "train.py"), + instance_type=instance_type, + sagemaker_session=sagemaker_session, + role=role, + ) + step_train = TrainingStep( + name="my-train", + display_name="TrainingStep", + description="description for Training step", + estimator=sklearn_train, + inputs=TrainingInput( + s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ + "train_data" + ].S3Output.S3Uri + ), + ) + + model = Model( + image_uri=sklearn_train.image_uri, + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + sagemaker_session=sagemaker_session, + role=role, + ) + model_inputs = CreateModelInput( + instance_type="ml.m5.large", + accelerator_type="ml.eia1.medium", + ) + step_model = CreateModelStep( + name="my-model", + display_name="ModelStep", + description="description for Model step", + model=model, + inputs=model_inputs, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_type, instance_count, output_prefix], + steps=[step_process, step_train, step_model], + sagemaker_session=sagemaker_session, + ) + + definition = json.loads(pipeline.definition()) + assert definition["Version"] == "2020-12-01" + + assert set(tuple(param.items()) for param in definition["Parameters"]) == set( + [ + tuple( + { + "Name": "InstanceType", + "Type": "String", + "DefaultValue": "ml.m5.xlarge", + }.items() + ), + 
tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()), + tuple( + { + "Name": "OutputPrefix", + "Type": "String", + "DefaultValue": "output", + }.items() + ), + ] + ) + + steps = definition["Steps"] + assert len(steps) == 3 + + names_and_types = [] + display_names_and_desc = [] + processing_args = {} + training_args = {} + for step in steps: + names_and_types.append((step["Name"], step["Type"])) + display_names_and_desc.append((step["DisplayName"], step["Description"])) + if step["Type"] == "Processing": + processing_args = step["Arguments"] + if step["Type"] == "Training": + training_args = step["Arguments"] + if step["Type"] == "Model": + model_args = step["Arguments"] + + assert set(names_and_types) == set( + [ + ("my-process", "Processing"), + ("my-train", "Training"), + ("my-model", "Model"), + ] + ) + + assert set(display_names_and_desc) == set( + [ + ("ProcessingStep", "description for Processing step"), + ("TrainingStep", "description for Training step"), + ("ModelStep", "description for Model step"), + ] + ) + assert processing_args["ProcessingResources"]["ClusterConfig"] == { + "InstanceType": {"Get": "Parameters.InstanceType"}, + "InstanceCount": {"Get": "Parameters.InstanceCount"}, + "VolumeSizeInGB": 30, + } + + assert training_args["ResourceConfig"] == { + "InstanceCount": 1, + "InstanceType": {"Get": "Parameters.InstanceType"}, + "VolumeSizeInGB": 30, + } + assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { + "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" + } + assert model_args["PrimaryContainer"]["ModelDataUrl"] == { + "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts" + } + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_steps_with_map_params_pipeline( + sagemaker_session, + role, + script_dir, + pipeline_name, + region_name, + athena_dataset_definition, +): + instance_count = ParameterInteger(name="InstanceCount", default_value=2) + framework_version = "0.20.0" + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") + output_prefix = ParameterString(name="OutputPrefix", default_value="output") + input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" + + sklearn_processor = SKLearnProcessor( + framework_version=framework_version, + instance_type=instance_type, + instance_count=instance_count, + base_job_name="test-sklearn", + sagemaker_session=sagemaker_session, + role=role, + ) + step_process = ProcessingStep( + name="my-process", + display_name="ProcessingStep", + description="description for Processing step", + processor=sklearn_processor, + inputs=[ + ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), + ProcessingInput(dataset_definition=athena_dataset_definition), + ], + outputs=[ + ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), + ProcessingOutput( + output_name="test_data", + source="/opt/ml/processing/test", + destination=Join( + on="/", + values=[ + "s3:/", + sagemaker_session.default_bucket(), + "test-sklearn", + output_prefix, + ExecutionVariables.PIPELINE_EXECUTION_ID, + ], + ), + ), + ], + code=os.path.join(script_dir, "preprocessing.py"), + ) + + sklearn_train = SKLearn( + framework_version=framework_version, + 
entry_point=os.path.join(script_dir, "train.py"), + instance_type=instance_type, + sagemaker_session=sagemaker_session, + role=role, + hyperparameters={ + "batch-size": 500, + "epochs": 5, + }, + ) + step_train = TrainingStep( + name="my-train", + display_name="TrainingStep", + description="description for Training step", + estimator=sklearn_train, + inputs=TrainingInput( + s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ + "train_data" + ].S3Output.S3Uri + ), + ) + + model = Model( + image_uri=sklearn_train.image_uri, + model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, + sagemaker_session=sagemaker_session, + role=role, + ) + model_inputs = CreateModelInput( + instance_type="ml.m5.large", + accelerator_type="ml.eia1.medium", + ) + step_model = CreateModelStep( + name="my-model", + display_name="ModelStep", + description="description for Model step", + model=model, + inputs=model_inputs, + ) + + # Condition step for evaluating model quality and branching execution + cond_lte = ConditionGreaterThanOrEqualTo( + left=step_train.properties.HyperParameters["batch-size"], + right=6.0, + ) + + step_cond = ConditionStep( + name="CustomerChurnAccuracyCond", + conditions=[cond_lte], + if_steps=[], + else_steps=[step_model], + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_type, instance_count, output_prefix], + steps=[step_process, step_train, step_cond], + sagemaker_session=sagemaker_session, + ) + + definition = json.loads(pipeline.definition()) + assert definition["Version"] == "2020-12-01" + + steps = definition["Steps"] + assert len(steps) == 3 + training_args = {} + condition_args = {} + for step in steps: + if step["Type"] == "Training": + training_args = step["Arguments"] + if step["Type"] == "Condition": + condition_args = step["Arguments"] + + assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { + "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" + } + assert condition_args["Conditions"][0]["LeftValue"] == { + "Get": "Steps.my-train.HyperParameters['batch-size']" + } + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + + finally: + try: + pipeline.delete() + except Exception: + pass + + +def test_one_step_ingestion_pipeline( + sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name +): + instance_count = ParameterInteger(name="InstanceCount", default_value=1) + instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge") + + input_name = "features.csv" + input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv") + input_data_uri = os.path.join( + "s3://", + sagemaker_session.default_bucket(), + "py-sdk-ingestion-test-input/features.csv", + ) + + with open(input_file_path, "r") as data: + body = data.read() + S3Uploader.upload_string_as_file_body( + body=body, + desired_s3_uri=input_data_uri, + sagemaker_session=sagemaker_session, + ) + + inputs = [ + ProcessingInput( + input_name=input_name, + source=input_data_uri, + destination="/opt/ml/processing/features.csv", + ) + ] + + feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}" + feature_group = FeatureGroup( + name=feature_group_name, + feature_definitions=feature_definitions, + sagemaker_session=feature_store_session, + ) + + ingestion_only_flow, output_name = 
generate_data_ingestion_flow_from_s3_input( + input_name, + input_data_uri, + s3_content_type="csv", + s3_has_header=True, + ) + + outputs = [ + ProcessingOutput( + output_name=output_name, + app_managed=True, + feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name), + ) + ] + + output_content_type = "CSV" + output_config = {output_name: {"content_type": output_content_type}} + job_argument = [f"--output-config '{json.dumps(output_config)}'"] + + temp_flow_path = "./ingestion.flow" + with cleanup_feature_group(feature_group): + json.dump(ingestion_only_flow, open(temp_flow_path, "w")) + + data_wrangler_processor = DataWranglerProcessor( + role=role, + data_wrangler_flow_source=temp_flow_path, + instance_count=instance_count, + instance_type=instance_type, + sagemaker_session=sagemaker_session, + max_runtime_in_seconds=86400, + ) + + data_wrangler_step = ProcessingStep( + name="ingestion-step", + processor=data_wrangler_processor, + inputs=inputs, + outputs=outputs, + job_arguments=job_argument, + ) + + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count, instance_type], + steps=[data_wrangler_step], + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role) + create_arn = response["PipelineArn"] + + offline_store_s3_uri = os.path.join( + "s3://", sagemaker_session.default_bucket(), feature_group_name + ) + feature_group.create( + s3_uri=offline_store_s3_uri, + record_identifier_name="f11", + event_time_feature_name="f10", + role_arn=role, + enable_online_store=False, + ) + _wait_for_feature_group_create(feature_group) + + execution = pipeline.start() + response = execution.describe() + assert response["PipelineArn"] == create_arn + + try: + execution.wait(delay=60, max_attempts=10) + except WaiterError: + pass + + execution_steps = execution.list_steps() + + assert len(execution_steps) == 1 + assert execution_steps[0]["StepName"] == "ingestion-step" + assert execution_steps[0]["StepStatus"] == "Succeeded" + + athena_query = feature_group.athena_query() + with timeout(minutes=10): + athena_query.run( + query_string=f'SELECT * FROM "{athena_query.table_name}"', + output_location=f"{offline_store_s3_uri}/query_results", + ) + athena_query.wait() + assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get( + "Status" + ).get("State") + + df = athena_query.as_dataframe() + assert pd.read_csv(input_file_path).shape[0] == df.shape[0] + finally: + try: + pipeline.delete() + except Exception as e: + print(f"Delete pipeline failed with error: {e}") + os.remove(temp_flow_path) + + +@pytest.mark.skip( + reason="""This test creates a long-running pipeline that + runs actual training jobs, processing jobs, etc. 
+ All of the functionality in this test is covered in + shallow tests in this suite; as such, this is disabled + and only run as part of the 'lineage' test suite.""" +) +def test_end_to_end_pipeline_successful_execution( + sagemaker_session, region_name, role, pipeline_name, wait=False +): + model_package_group_name = f"{pipeline_name}ModelPackageGroup" + data_path = os.path.join(DATA_DIR, "workflow") + default_bucket = sagemaker_session.default_bucket() + + # download the input data + local_input_path = os.path.join(data_path, "abalone-dataset.csv") + s3 = sagemaker_session.boto_session.resource("s3") + s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file( + "dataset/abalone-dataset.csv", local_input_path + ) + + # # upload the input data to our bucket + base_uri = f"s3://{default_bucket}/{pipeline_name}" + with open(local_input_path) as data: + body = data.read() + input_data_uri = S3Uploader.upload_string_as_file_body( + body=body, + desired_s3_uri=f"{base_uri}/abalone-dataset.csv", + sagemaker_session=sagemaker_session, + ) + + # download batch transform data + local_batch_path = os.path.join(data_path, "abalone-dataset-batch") + s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file( + "dataset/abalone-dataset-batch", local_batch_path + ) + + # upload the batch transform data + with open(local_batch_path) as data: + body = data.read() + batch_data_uri = S3Uploader.upload_string_as_file_body( + body=body, + desired_s3_uri=f"{base_uri}/abalone-dataset-batch", + sagemaker_session=sagemaker_session, + ) + + # define parameters + processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) + processing_instance_type = ParameterString( + name="ProcessingInstanceType", default_value="ml.m5.xlarge" + ) + training_instance_type = ParameterString( + name="TrainingInstanceType", default_value="ml.m5.xlarge" + ) + model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved") + input_data = ParameterString( + name="InputData", + default_value=input_data_uri, + ) + batch_data = ParameterString( + name="BatchData", + default_value=batch_data_uri, + ) + + # define processing step + framework_version = "0.23-1" + sklearn_processor = SKLearnProcessor( + framework_version=framework_version, + instance_type=processing_instance_type, + instance_count=processing_instance_count, + base_job_name=f"{pipeline_name}-process", + role=role, + sagemaker_session=sagemaker_session, + ) + step_process = ProcessingStep( + name="AbaloneProcess", + processor=sklearn_processor, + inputs=[ + ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), + ], + outputs=[ + ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), + ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), + ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), + ], + code=os.path.join(data_path, "abalone/preprocessing.py"), + ) + + # define training step + model_path = f"s3://{default_bucket}/{pipeline_name}Train" + image_uri = image_uris.retrieve( + framework="xgboost", + region=region_name, + version="1.0-1", + py_version="py3", + instance_type=training_instance_type, + ) + xgb_train = Estimator( + image_uri=image_uri, + instance_type=training_instance_type, + instance_count=1, + output_path=model_path, + role=role, + sagemaker_session=sagemaker_session, + ) + xgb_train.set_hyperparameters( + objective="reg:linear", + num_round=50, + max_depth=5, + eta=0.2, + 
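+        # Note: recent XGBoost releases rename reg:linear to reg:squarederror and
+        # replace silent with verbosity; the 1.0-1 image used here still accepts
+        # the older names.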
+        gamma=4,
+        min_child_weight=6,
+        subsample=0.7,
+        silent=0,
+    )
+    step_train = TrainingStep(
+        name="AbaloneTrain",
+        estimator=xgb_train,
+        inputs={
+            "train": TrainingInput(
+                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
+                    "train"
+                ].S3Output.S3Uri,
+                content_type="text/csv",
+            ),
+            "validation": TrainingInput(
+                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
+                    "validation"
+                ].S3Output.S3Uri,
+                content_type="text/csv",
+            ),
+        },
+    )
+
+    # define evaluation step
+    script_eval = ScriptProcessor(
+        image_uri=image_uri,
+        command=["python3"],
+        instance_type=processing_instance_type,
+        instance_count=1,
+        base_job_name=f"{pipeline_name}-eval",
+        role=role,
+        sagemaker_session=sagemaker_session,
+    )
+    evaluation_report = PropertyFile(
+        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
+    )
+    step_eval = ProcessingStep(
+        name="AbaloneEval",
+        processor=script_eval,
+        inputs=[
+            ProcessingInput(
+                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
+                destination="/opt/ml/processing/model",
+            ),
+            ProcessingInput(
+                source=step_process.properties.ProcessingOutputConfig.Outputs[
+                    "test"
+                ].S3Output.S3Uri,
+                destination="/opt/ml/processing/test",
+            ),
+        ],
+        outputs=[
+            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
+        ],
+        code=os.path.join(data_path, "abalone/evaluation.py"),
+        property_files=[evaluation_report],
+    )
+
+    # define create model step
+    model = Model(
+        image_uri=image_uri,
+        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
+        sagemaker_session=sagemaker_session,
+        role=role,
+    )
+    inputs = CreateModelInput(
+        instance_type="ml.m5.large",
+        accelerator_type="ml.eia1.medium",
+    )
+    step_create_model = CreateModelStep(
+        name="AbaloneCreateModel",
+        model=model,
+        inputs=inputs,
+    )
+
+    # define transform step
+    transformer = Transformer(
+        model_name=step_create_model.properties.ModelName,
+        instance_type="ml.m5.xlarge",
+        instance_count=1,
+        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
+        sagemaker_session=sagemaker_session,
+    )
+    step_transform = TransformStep(
+        name="AbaloneTransform",
+        transformer=transformer,
+        inputs=TransformInput(data=batch_data),
+    )
+
+    # define register model step
+    model_metrics = ModelMetrics(
+        model_statistics=MetricsSource(
+            s3_uri="{}/evaluation.json".format(
+                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
+            ),
+            content_type="application/json",
+        )
+    )
+    step_register = RegisterModel(
+        name="AbaloneRegisterModel",
+        estimator=xgb_train,
+        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
+        content_types=["text/csv"],
+        response_types=["text/csv"],
+        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
+        transform_instances=["ml.m5.xlarge"],
+        model_package_group_name=model_package_group_name,
+        approval_status=model_approval_status,
+        model_metrics=model_metrics,
+    )
+
+    # define condition step
+    cond_lte = ConditionLessThanOrEqualTo(
+        left=JsonGet(
+            step_name=step_eval.name,
+            property_file=evaluation_report,
+            json_path="regression_metrics.mse.value",
+        ),
+        right=20.0,
+    )
+
+    step_cond = ConditionStep(
+        name="AbaloneMSECond",
+        conditions=[cond_lte],
+        if_steps=[step_register, step_create_model, step_transform],
+        else_steps=[],
+    )
+
+    # define pipeline
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[
+            processing_instance_type,
+            processing_instance_count,
+            training_instance_type,
+            model_approval_status,
+            input_data,
+            batch_data,
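+            # each parameter above can be overridden per execution via
+            # pipeline.start(parameters={...}); otherwise the defaults apply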
+        ],
+        steps=[step_process, step_train, step_eval, step_cond],
+        sagemaker_session=sagemaker_session,
+    )
+
+    pipeline.create(role)
+    execution = pipeline.start()
+    execution_arn = execution.arn
+
+    if wait:
+        execution.wait()
+
+    return execution_arn
+
+
+def _wait_for_feature_group_create(feature_group: FeatureGroup):
+    status = feature_group.describe().get("FeatureGroupStatus")
+    while status == "Creating":
+        print("Waiting for Feature Group Creation")
+        time.sleep(5)
+        status = feature_group.describe().get("FeatureGroupStatus")
+    if status != "Created":
+        print(feature_group.describe())
+        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
+    print(f"FeatureGroup {feature_group.name} successfully created.")
+
+
+@pytest.fixture
+def feature_definitions():
+    return [
+        FeatureDefinition(feature_name="f1", feature_type=FeatureTypeEnum.STRING),
+        FeatureDefinition(feature_name="f2", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f3", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f4", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f5", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f6", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f7", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f8", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f9", feature_type=FeatureTypeEnum.INTEGRAL),
+        FeatureDefinition(feature_name="f10", feature_type=FeatureTypeEnum.FRACTIONAL),
+        FeatureDefinition(feature_name="f11", feature_type=FeatureTypeEnum.STRING),
+    ]
+
+
+@contextmanager
+def cleanup_feature_group(feature_group: FeatureGroup):
+    try:
+        yield
+    finally:
+        try:
+            feature_group.delete()
+            print("FeatureGroup cleaned up")
+        except Exception as e:
+            print(f"Delete FeatureGroup failed with error: {e}.")
+
+
+def test_large_pipeline(sagemaker_session, role, pipeline_name, region_name):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
+
+    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)
+
+    callback_steps = [
+        CallbackStep(
+            name=f"callback-step{count}",
+            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
+            inputs={"arg1": "foo"},
+            outputs=[outputParam],
+        )
+        for count in range(2000)
+    ]
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_count],
+        steps=callback_steps,
+        sagemaker_session=sagemaker_session,
+    )
+
+    try:
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+        response = pipeline.describe()
+        assert len(json.loads(response["PipelineDefinition"])["Steps"]) == 2000
+
+        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
+        response = pipeline.update(role)
+        update_arn = response["PipelineArn"]
+        assert re.match(
+            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            update_arn,
+        )
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass
+
+
+def test_create_and_update_with_parallelism_config(
+    sagemaker_session, role, pipeline_name, region_name
+):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
+
+    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)
+
+    callback_steps = [
+        CallbackStep(
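+            # the queue URL below is a placeholder (123456789012 is the
+            # standard AWS example account); this test only creates and
+            # updates the pipeline definition and never starts an execution,
+            # so no messages are ever sent to the queue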
name=f"callback-step{count}", + sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", + inputs={"arg1": "foo"}, + outputs=[outputParam], + ) + for count in range(500) + ] + pipeline = Pipeline( + name=pipeline_name, + parameters=[instance_count], + steps=callback_steps, + sagemaker_session=sagemaker_session, + ) + + try: + response = pipeline.create(role, parallelism_config={"MaxParallelExecutionSteps": 50}) + create_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + create_arn, + ) + response = pipeline.describe() + assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 50 + + pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] + response = pipeline.update(role, parallelism_config={"MaxParallelExecutionSteps": 55}) + update_arn = response["PipelineArn"] + assert re.match( + rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", + update_arn, + ) + + response = pipeline.describe() + assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 55 + + finally: + try: + pipeline.delete() + except Exception: + pass diff --git a/tests/integ/test_workflow_with_clarify.py b/tests/integ/sagemaker/workflow/test_workflow_with_clarify.py similarity index 100% rename from tests/integ/test_workflow_with_clarify.py rename to tests/integ/sagemaker/workflow/test_workflow_with_clarify.py diff --git a/tests/integ/test_workflow.py b/tests/integ/test_workflow.py deleted file mode 100644 index dd24149ca4..0000000000 --- a/tests/integ/test_workflow.py +++ /dev/null @@ -1,3012 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import json -import os -import re -import subprocess -import time -import uuid -import logging - -from contextlib import contextmanager -import pytest - -from botocore.exceptions import WaiterError -import pandas as pd - -import tests -from sagemaker.drift_check_baselines import DriftCheckBaselines -from tests.integ.timeout import timeout - -from sagemaker.debugger import ( - DebuggerHookConfig, - Rule, - rule_configs, -) -from datetime import datetime -from sagemaker.session import Session -from sagemaker import image_uris, PipelineModel -from sagemaker.estimator import Estimator -from sagemaker import FileSource, utils -from sagemaker.inputs import CreateModelInput, TrainingInput -from sagemaker.model import Model -from sagemaker.model_metrics import MetricsSource, ModelMetrics -from sagemaker.processing import ( - ProcessingInput, - ProcessingOutput, - FeatureStoreOutput, - ScriptProcessor, -) -from sagemaker.pytorch.estimator import PyTorch -from sagemaker.tuner import HyperparameterTuner, IntegerParameter -from sagemaker.s3 import S3Uploader -from sagemaker.session import get_execution_role -from sagemaker.sklearn.estimator import SKLearn -from sagemaker.sklearn import SKLearnModel -from sagemaker.transformer import Transformer -from sagemaker.mxnet.model import MXNetModel -from sagemaker.xgboost import XGBoostModel -from sagemaker.xgboost import XGBoost -from sagemaker.sklearn.processing import SKLearnProcessor -from sagemaker.spark.processing import PySparkProcessor, SparkJarProcessor -from sagemaker.workflow.conditions import ( - ConditionGreaterThanOrEqualTo, - ConditionIn, - ConditionLessThanOrEqualTo, -) -from sagemaker.workflow.condition_step import ConditionStep -from sagemaker.workflow.callback_step import ( - CallbackStep, - CallbackOutput, - CallbackOutputTypeEnum, -) -from sagemaker.workflow.lambda_step import ( - LambdaStep, - LambdaOutput, - LambdaOutputTypeEnum, -) -from sagemaker.workflow.emr_step import EMRStep, EMRStepConfig -from sagemaker.wrangler.processing import DataWranglerProcessor -from sagemaker.dataset_definition.inputs import ( - DatasetDefinition, - AthenaDatasetDefinition, -) -from sagemaker.workflow.execution_variables import ExecutionVariables -from sagemaker.workflow.functions import Join, JsonGet -from sagemaker.wrangler.ingestion import generate_data_ingestion_flow_from_s3_input -from sagemaker.workflow.parameters import ( - ParameterInteger, - ParameterString, -) -from sagemaker.workflow.steps import ( - CreateModelStep, - ProcessingStep, - TrainingStep, - CacheConfig, - TuningStep, - TransformStep, - TransformInput, - PropertyFile, -) -from sagemaker.workflow.step_collections import RegisterModel -from sagemaker.workflow.pipeline import Pipeline -from sagemaker.lambda_helper import Lambda -from sagemaker.feature_store.feature_group import ( - FeatureGroup, - FeatureDefinition, - FeatureTypeEnum, -) -from tests.integ import DATA_DIR -from tests.integ.kms_utils import get_or_create_kms_key -from tests.integ.retry import retries - - -def ordered(obj): - """Helper function for dict comparison""" - if isinstance(obj, dict): - return sorted((k, ordered(v)) for k, v in obj.items()) - if isinstance(obj, list): - return sorted(ordered(x) for x in obj) - else: - return obj - - -@pytest.fixture(scope="module") -def region_name(sagemaker_session): - return sagemaker_session.boto_session.region_name - - -@pytest.fixture(scope="module") -def role(sagemaker_session): - return get_execution_role(sagemaker_session) - - 
-@pytest.fixture(scope="module") -def script_dir(): - return os.path.join(DATA_DIR, "sklearn_processing") - - -@pytest.fixture(scope="module") -def feature_store_session(sagemaker_session): - boto_session = sagemaker_session.boto_session - sagemaker_client = boto_session.client("sagemaker") - featurestore_runtime_client = boto_session.client("sagemaker-featurestore-runtime") - - return Session( - boto_session=boto_session, - sagemaker_client=sagemaker_client, - sagemaker_featurestore_runtime_client=featurestore_runtime_client, - ) - - -@pytest.fixture -def pipeline_name(): - return f"my-pipeline-{int(time.time() * 10 ** 7)}" - - -@pytest.fixture -def athena_dataset_definition(sagemaker_session): - return DatasetDefinition( - local_path="/opt/ml/processing/input/add", - data_distribution_type="FullyReplicated", - input_mode="File", - athena_dataset_definition=AthenaDatasetDefinition( - catalog="AwsDataCatalog", - database="default", - work_group="workgroup", - query_string=('SELECT * FROM "default"."s3_test_table_$STAGE_$REGIONUNDERSCORED";'), - output_s3_uri=f"s3://{sagemaker_session.default_bucket()}/add", - output_format="JSON", - output_compression="GZIP", - ), - ) - - -@pytest.fixture -def configuration() -> list: - configuration = [ - { - "Classification": "spark-defaults", - "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"}, - }, - { - "Classification": "hadoop-env", - "Properties": {}, - "Configurations": [ - { - "Classification": "export", - "Properties": { - "HADOOP_DATANODE_HEAPSIZE": "2048", - "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", - }, - "Configurations": [], - } - ], - }, - { - "Classification": "core-site", - "Properties": {"spark.executor.memory": "2g", "spark.executor.cores": "1"}, - }, - {"Classification": "hadoop-log4j", "Properties": {"key": "value"}}, - { - "Classification": "hive-env", - "Properties": {}, - "Configurations": [ - { - "Classification": "export", - "Properties": { - "HADOOP_DATANODE_HEAPSIZE": "2048", - "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", - }, - "Configurations": [], - } - ], - }, - {"Classification": "hive-log4j", "Properties": {"key": "value"}}, - {"Classification": "hive-exec-log4j", "Properties": {"key": "value"}}, - {"Classification": "hive-site", "Properties": {"key": "value"}}, - {"Classification": "spark-defaults", "Properties": {"key": "value"}}, - { - "Classification": "spark-env", - "Properties": {}, - "Configurations": [ - { - "Classification": "export", - "Properties": { - "HADOOP_DATANODE_HEAPSIZE": "2048", - "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", - }, - "Configurations": [], - } - ], - }, - {"Classification": "spark-log4j", "Properties": {"key": "value"}}, - {"Classification": "spark-hive-site", "Properties": {"key": "value"}}, - {"Classification": "spark-metrics", "Properties": {"key": "value"}}, - {"Classification": "yarn-site", "Properties": {"key": "value"}}, - { - "Classification": "yarn-env", - "Properties": {}, - "Configurations": [ - { - "Classification": "export", - "Properties": { - "HADOOP_DATANODE_HEAPSIZE": "2048", - "HADOOP_NAMENODE_OPTS": "-XX:GCTimeRatio=19", - }, - "Configurations": [], - } - ], - }, - ] - return configuration - - -@pytest.fixture(scope="module") -def build_jar(): - spark_path = os.path.join(DATA_DIR, "spark") - java_file_path = os.path.join("com", "amazonaws", "sagemaker", "spark", "test") - java_version_pattern = r"(\d+\.\d+).*" - jar_file_path = os.path.join(spark_path, "code", "java", "hello-java-spark") - # compile java file - java_version = 
subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT).decode( - "utf-8" - ) - java_version = re.search(java_version_pattern, java_version).groups()[0] - - if float(java_version) > 1.8: - subprocess.run( - [ - "javac", - "--release", - "8", - os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.java"), - ] - ) - else: - subprocess.run( - [ - "javac", - os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.java"), - ] - ) - - subprocess.run( - [ - "jar", - "cfm", - os.path.join(jar_file_path, "hello-spark-java.jar"), - os.path.join(jar_file_path, "manifest.txt"), - "-C", - jar_file_path, - ".", - ] - ) - yield - subprocess.run(["rm", os.path.join(jar_file_path, "hello-spark-java.jar")]) - subprocess.run(["rm", os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.class")]) - - -def test_three_step_definition( - sagemaker_session, - region_name, - role, - script_dir, - pipeline_name, - athena_dataset_definition, -): - framework_version = "0.20.0" - instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") - instance_count = ParameterInteger(name="InstanceCount", default_value=1) - output_prefix = ParameterString(name="OutputPrefix", default_value="output") - - input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" - - sklearn_processor = SKLearnProcessor( - framework_version=framework_version, - instance_type=instance_type, - instance_count=instance_count, - base_job_name="test-sklearn", - sagemaker_session=sagemaker_session, - role=role, - ) - step_process = ProcessingStep( - name="my-process", - display_name="ProcessingStep", - description="description for Processing step", - processor=sklearn_processor, - inputs=[ - ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), - ProcessingInput(dataset_definition=athena_dataset_definition), - ], - outputs=[ - ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), - ProcessingOutput( - output_name="test_data", - source="/opt/ml/processing/test", - destination=Join( - on="/", - values=[ - "s3:/", - sagemaker_session.default_bucket(), - "test-sklearn", - output_prefix, - ExecutionVariables.PIPELINE_EXECUTION_ID, - ], - ), - ), - ], - code=os.path.join(script_dir, "preprocessing.py"), - ) - - sklearn_train = SKLearn( - framework_version=framework_version, - entry_point=os.path.join(script_dir, "train.py"), - instance_type=instance_type, - sagemaker_session=sagemaker_session, - role=role, - ) - step_train = TrainingStep( - name="my-train", - display_name="TrainingStep", - description="description for Training step", - estimator=sklearn_train, - inputs=TrainingInput( - s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ - "train_data" - ].S3Output.S3Uri - ), - ) - - model = Model( - image_uri=sklearn_train.image_uri, - model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, - sagemaker_session=sagemaker_session, - role=role, - ) - model_inputs = CreateModelInput( - instance_type="ml.m5.large", - accelerator_type="ml.eia1.medium", - ) - step_model = CreateModelStep( - name="my-model", - display_name="ModelStep", - description="description for Model step", - model=model, - inputs=model_inputs, - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_type, instance_count, output_prefix], - steps=[step_process, step_train, step_model], - sagemaker_session=sagemaker_session, - ) - - definition = json.loads(pipeline.definition()) - assert definition["Version"] == 
"2020-12-01" - - assert set(tuple(param.items()) for param in definition["Parameters"]) == set( - [ - tuple( - { - "Name": "InstanceType", - "Type": "String", - "DefaultValue": "ml.m5.xlarge", - }.items() - ), - tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()), - tuple( - { - "Name": "OutputPrefix", - "Type": "String", - "DefaultValue": "output", - }.items() - ), - ] - ) - - steps = definition["Steps"] - assert len(steps) == 3 - - names_and_types = [] - display_names_and_desc = [] - processing_args = {} - training_args = {} - for step in steps: - names_and_types.append((step["Name"], step["Type"])) - display_names_and_desc.append((step["DisplayName"], step["Description"])) - if step["Type"] == "Processing": - processing_args = step["Arguments"] - if step["Type"] == "Training": - training_args = step["Arguments"] - if step["Type"] == "Model": - model_args = step["Arguments"] - - assert set(names_and_types) == set( - [ - ("my-process", "Processing"), - ("my-train", "Training"), - ("my-model", "Model"), - ] - ) - - assert set(display_names_and_desc) == set( - [ - ("ProcessingStep", "description for Processing step"), - ("TrainingStep", "description for Training step"), - ("ModelStep", "description for Model step"), - ] - ) - assert processing_args["ProcessingResources"]["ClusterConfig"] == { - "InstanceType": {"Get": "Parameters.InstanceType"}, - "InstanceCount": {"Get": "Parameters.InstanceCount"}, - "VolumeSizeInGB": 30, - } - - assert training_args["ResourceConfig"] == { - "InstanceCount": 1, - "InstanceType": {"Get": "Parameters.InstanceType"}, - "VolumeSizeInGB": 30, - } - assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { - "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" - } - assert model_args["PrimaryContainer"]["ModelDataUrl"] == { - "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts" - } - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_sklearn_processing_pipeline( - sagemaker_session, - role, - sklearn_latest_version, - cpu_instance_type, - pipeline_name, - region_name, - athena_dataset_definition, -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - script_path = os.path.join(DATA_DIR, "dummy_script.py") - input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") - inputs = [ - ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"), - ProcessingInput(dataset_definition=athena_dataset_definition), - ] - - cache_config = CacheConfig(enable_caching=True, expire_after="T30m") - - sklearn_processor = SKLearnProcessor( - framework_version=sklearn_latest_version, - role=role, - instance_type=cpu_instance_type, - instance_count=instance_count, - command=["python3"], - sagemaker_session=sagemaker_session, - base_job_name="test-sklearn", - ) - - step_sklearn = ProcessingStep( - name="sklearn-process", - processor=sklearn_processor, - inputs=inputs, - code=script_path, - cache_config=cache_config, - ) - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_sklearn], - sagemaker_session=sagemaker_session, - ) - - try: - # NOTE: We should exercise the case when role used in the pipeline execution is - # different than that required of the steps in the pipeline itself. 
The role in - # the pipeline definition needs to create training and processing jobs and other - # sagemaker entities. However, the jobs created in the steps themselves execute - # under a potentially different role, often requiring access to S3 and other - # artifacts not required to during creation of the jobs in the pipeline steps. - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - - response = execution.describe() - assert response["PipelineArn"] == create_arn - - # Check CacheConfig - response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] - assert response["Enabled"] == cache_config.enable_caching - assert response["ExpireAfter"] == cache_config.expire_after - - try: - execution.wait(delay=30, max_attempts=3) - except WaiterError: - pass - execution_steps = execution.list_steps() - assert len(execution_steps) == 1 - assert execution_steps[0]["StepName"] == "sklearn-process" - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_framework_processing_pipeline( - sagemaker_session, - role, - sklearn_latest_version, - cpu_instance_type, - pipeline_name, - region_name, - athena_dataset_definition, -): - """Use `SKLearnProcessor` to test `FrameworkProcessor`.""" - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - script_path = os.path.join(DATA_DIR, "dummy_script.py") - input_file_path = os.path.join(DATA_DIR, "dummy_input.txt") - - inputs = [ - ProcessingInput(source=input_file_path, destination="/opt/ml/processing/inputs/"), - ProcessingInput(dataset_definition=athena_dataset_definition), - ] - - cache_config = CacheConfig(enable_caching=True, expire_after="T30m") - - sklearn_processor = SKLearnProcessor( - framework_version=sklearn_latest_version, - role=role, - instance_type=cpu_instance_type, - instance_count=instance_count, - sagemaker_session=sagemaker_session, - base_job_name="test-sklearn", - ) - - run_args = sklearn_processor.get_run_args(code=script_path, inputs=inputs) - - step_sklearn = ProcessingStep( - name="sklearn-process", - processor=sklearn_processor, - inputs=run_args.inputs, - outputs=run_args.outputs, - job_arguments=run_args.arguments, - code=run_args.code, - cache_config=cache_config, - ) - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_sklearn], - sagemaker_session=sagemaker_session, - ) - - try: - # NOTE: We should exercise the case when role used in the pipeline execution is - # different than that required of the steps in the pipeline itself. The role in - # the pipeline definition needs to create training and processing jobs and other - # sagemaker entities. However, the jobs created in the steps themselves execute - # under a potentially different role, often requiring access to S3 and other - # artifacts not required to during creation of the jobs in the pipeline steps. 
- response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - response = pipeline.update(role) - update_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - update_arn, - ) - - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - - response = execution.describe() - assert response["PipelineArn"] == create_arn - - # Check CacheConfig - response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] - assert response["Enabled"] == cache_config.enable_caching - assert response["ExpireAfter"] == cache_config.expire_after - - try: - execution.wait(delay=30, max_attempts=3) - except WaiterError: - pass - execution_steps = execution.list_steps() - assert len(execution_steps) == 1 - assert execution_steps[0]["StepName"] == "sklearn-process" - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_pyspark_processing_pipeline( - sagemaker_session, - role, - cpu_instance_type, - pipeline_name, - region_name, -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - script_path = os.path.join(DATA_DIR, "dummy_script.py") - - cache_config = CacheConfig(enable_caching=True, expire_after="T30m") - - pyspark_processor = PySparkProcessor( - base_job_name="sm-spark", - framework_version="2.4", - role=role, - instance_count=instance_count, - instance_type=cpu_instance_type, - max_runtime_in_seconds=1200, - sagemaker_session=sagemaker_session, - ) - - spark_run_args = pyspark_processor.get_run_args( - submit_app=script_path, - arguments=[ - "--s3_input_bucket", - sagemaker_session.default_bucket(), - "--s3_input_key_prefix", - "spark-input", - "--s3_output_bucket", - sagemaker_session.default_bucket(), - "--s3_output_key_prefix", - "spark-output", - ], - ) - - step_pyspark = ProcessingStep( - name="pyspark-process", - processor=pyspark_processor, - inputs=spark_run_args.inputs, - outputs=spark_run_args.outputs, - job_arguments=spark_run_args.arguments, - code=spark_run_args.code, - cache_config=cache_config, - ) - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_pyspark], - sagemaker_session=sagemaker_session, - ) - - try: - # NOTE: We should exercise the case when role used in the pipeline execution is - # different than that required of the steps in the pipeline itself. The role in - # the pipeline definition needs to create training and processing jobs and other - # sagemaker entities. However, the jobs created in the steps themselves execute - # under a potentially different role, often requiring access to S3 and other - # artifacts not required to during creation of the jobs in the pipeline steps. 
- response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - response = pipeline.update(role) - update_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - update_arn, - ) - - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - - response = execution.describe() - assert response["PipelineArn"] == create_arn - - # Check CacheConfig - response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] - assert response["Enabled"] == cache_config.enable_caching - assert response["ExpireAfter"] == cache_config.expire_after - - try: - execution.wait(delay=30, max_attempts=3) - except WaiterError: - pass - execution_steps = execution.list_steps() - assert len(execution_steps) == 1 - assert execution_steps[0]["StepName"] == "pyspark-process" - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_sparkjar_processing_pipeline( - sagemaker_session, - role, - cpu_instance_type, - pipeline_name, - region_name, - configuration, - build_jar, -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - cache_config = CacheConfig(enable_caching=True, expire_after="T30m") - spark_path = os.path.join(DATA_DIR, "spark") - - spark_jar_processor = SparkJarProcessor( - role=role, - instance_count=2, - instance_type=cpu_instance_type, - sagemaker_session=sagemaker_session, - framework_version="2.4", - ) - bucket = spark_jar_processor.sagemaker_session.default_bucket() - with open(os.path.join(spark_path, "files", "data.jsonl")) as data: - body = data.read() - input_data_uri = f"s3://{bucket}/spark/input/data.jsonl" - S3Uploader.upload_string_as_file_body( - body=body, - desired_s3_uri=input_data_uri, - sagemaker_session=sagemaker_session, - ) - output_data_uri = f"s3://{bucket}/spark/output/sales/{datetime.now().isoformat()}" - - java_project_dir = os.path.join(spark_path, "code", "java", "hello-java-spark") - spark_run_args = spark_jar_processor.get_run_args( - submit_app=f"{java_project_dir}/hello-spark-java.jar", - submit_class="com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp", - arguments=["--input", input_data_uri, "--output", output_data_uri], - configuration=configuration, - ) - - step_pyspark = ProcessingStep( - name="sparkjar-process", - processor=spark_jar_processor, - inputs=spark_run_args.inputs, - outputs=spark_run_args.outputs, - job_arguments=spark_run_args.arguments, - code=spark_run_args.code, - cache_config=cache_config, - ) - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_pyspark], - sagemaker_session=sagemaker_session, - ) - - try: - # NOTE: We should exercise the case when role used in the pipeline execution is - # different than that required of the steps in the pipeline itself. The role in - # the pipeline definition needs to create training and processing jobs and other - # sagemaker entities. However, the jobs created in the steps themselves execute - # under a potentially different role, often requiring access to S3 and other - # artifacts not required to during creation of the jobs in the pipeline steps. 
- response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - response = pipeline.update(role) - update_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - update_arn, - ) - - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - - response = execution.describe() - assert response["PipelineArn"] == create_arn - - # Check CacheConfig - response = json.loads(pipeline.describe()["PipelineDefinition"])["Steps"][0]["CacheConfig"] - assert response["Enabled"] == cache_config.enable_caching - assert response["ExpireAfter"] == cache_config.expire_after - - try: - execution.wait(delay=30, max_attempts=3) - except WaiterError: - pass - execution_steps = execution.list_steps() - assert len(execution_steps) == 1 - assert execution_steps[0]["StepName"] == "sparkjar-process" - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_callback_pipeline(sagemaker_session, role, pipeline_name, region_name): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - - outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String) - step_callback = CallbackStep( - name="callback-step", - sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", - inputs={"arg1": "foo"}, - outputs=[outputParam1], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_callback], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - response = pipeline.update(role) - update_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - update_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_steps_with_map_params_pipeline( - sagemaker_session, - role, - script_dir, - pipeline_name, - region_name, - athena_dataset_definition, -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - framework_version = "0.20.0" - instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") - output_prefix = ParameterString(name="OutputPrefix", default_value="output") - input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" - - sklearn_processor = SKLearnProcessor( - framework_version=framework_version, - instance_type=instance_type, - instance_count=instance_count, - base_job_name="test-sklearn", - sagemaker_session=sagemaker_session, - role=role, - ) - step_process = ProcessingStep( - name="my-process", - display_name="ProcessingStep", - description="description for Processing step", - processor=sklearn_processor, - inputs=[ - ProcessingInput(source=input_data, destination="/opt/ml/processing/input"), - ProcessingInput(dataset_definition=athena_dataset_definition), - ], - outputs=[ - ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"), - ProcessingOutput( - 
output_name="test_data", - source="/opt/ml/processing/test", - destination=Join( - on="/", - values=[ - "s3:/", - sagemaker_session.default_bucket(), - "test-sklearn", - output_prefix, - ExecutionVariables.PIPELINE_EXECUTION_ID, - ], - ), - ), - ], - code=os.path.join(script_dir, "preprocessing.py"), - ) - - sklearn_train = SKLearn( - framework_version=framework_version, - entry_point=os.path.join(script_dir, "train.py"), - instance_type=instance_type, - sagemaker_session=sagemaker_session, - role=role, - hyperparameters={ - "batch-size": 500, - "epochs": 5, - }, - ) - step_train = TrainingStep( - name="my-train", - display_name="TrainingStep", - description="description for Training step", - estimator=sklearn_train, - inputs=TrainingInput( - s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ - "train_data" - ].S3Output.S3Uri - ), - ) - - model = Model( - image_uri=sklearn_train.image_uri, - model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, - sagemaker_session=sagemaker_session, - role=role, - ) - model_inputs = CreateModelInput( - instance_type="ml.m5.large", - accelerator_type="ml.eia1.medium", - ) - step_model = CreateModelStep( - name="my-model", - display_name="ModelStep", - description="description for Model step", - model=model, - inputs=model_inputs, - ) - - # Condition step for evaluating model quality and branching execution - cond_lte = ConditionGreaterThanOrEqualTo( - left=step_train.properties.HyperParameters["batch-size"], - right=6.0, - ) - - step_cond = ConditionStep( - name="CustomerChurnAccuracyCond", - conditions=[cond_lte], - if_steps=[], - else_steps=[step_model], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_type, instance_count, output_prefix], - steps=[step_process, step_train, step_cond], - sagemaker_session=sagemaker_session, - ) - - definition = json.loads(pipeline.definition()) - assert definition["Version"] == "2020-12-01" - - steps = definition["Steps"] - assert len(steps) == 3 - training_args = {} - condition_args = {} - for step in steps: - if step["Type"] == "Training": - training_args = step["Arguments"] - if step["Type"] == "Condition": - condition_args = step["Arguments"] - - assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == { - "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri" - } - assert condition_args["Conditions"][0]["LeftValue"] == { - "Get": "Steps.my-train.HyperParameters['batch-size']" - } - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_two_step_callback_pipeline_with_output_reference( - sagemaker_session, role, pipeline_name, region_name -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - - outputParam1 = CallbackOutput(output_name="output1", output_type=CallbackOutputTypeEnum.String) - step_callback1 = CallbackStep( - name="callback-step1", - sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", - inputs={"arg1": "foo"}, - outputs=[outputParam1], - ) - - step_callback2 = CallbackStep( - name="callback-step2", - sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue", - inputs={"arg1": outputParam1}, - outputs=[], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_callback1, 
step_callback2], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_one_step_lambda_pipeline(sagemaker_session, role, pipeline_name, region_name): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - - outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String) - step_lambda = LambdaStep( - name="lambda-step", - lambda_func=Lambda( - function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), - session=sagemaker_session, - ), - inputs={"arg1": "foo"}, - outputs=[outputParam1], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_lambda], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)] - response = pipeline.update(role) - update_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - update_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_two_step_lambda_pipeline_with_output_reference( - sagemaker_session, role, pipeline_name, region_name -): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - - outputParam1 = LambdaOutput(output_name="output1", output_type=LambdaOutputTypeEnum.String) - step_lambda1 = LambdaStep( - name="lambda-step1", - lambda_func=Lambda( - function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), - session=sagemaker_session, - ), - inputs={"arg1": "foo"}, - outputs=[outputParam1], - ) - - step_lambda2 = LambdaStep( - name="lambda-step2", - lambda_func=Lambda( - function_arn=("arn:aws:lambda:us-west-2:123456789012:function:sagemaker_test_lambda"), - session=sagemaker_session, - ), - inputs={"arg1": outputParam1}, - outputs=[], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_lambda1, step_lambda2], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_two_steps_emr_pipeline(sagemaker_session, role, pipeline_name, region_name): - instance_count = ParameterInteger(name="InstanceCount", default_value=2) - - emr_step_config = EMRStepConfig( - jar="s3://us-west-2.elasticmapreduce/libs/script-runner/script-runner.jar", - args=["dummy_emr_script_path"], - ) - - step_emr_1 = EMRStep( - name="emr-step-1", - cluster_id="j-1YONHTCP3YZKC", - display_name="emr_step_1", - description="MyEMRStepDescription", - step_config=emr_step_config, - ) - - step_emr_2 = EMRStep( - name="emr-step-2", - cluster_id=step_emr_1.properties.ClusterId, - display_name="emr_step_2", - description="MyEMRStepDescription", - step_config=emr_step_config, - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count], - steps=[step_emr_1, step_emr_2], - 
sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_conditional_pytorch_training_model_registration( - sagemaker_session, - role, - cpu_instance_type, - pipeline_name, - region_name, -): - base_dir = os.path.join(DATA_DIR, "pytorch_mnist") - entry_point = os.path.join(base_dir, "mnist.py") - input_path = sagemaker_session.upload_data( - path=os.path.join(base_dir, "training"), - key_prefix="integ-test-data/pytorch_mnist/training", - ) - inputs = TrainingInput(s3_data=input_path) - - instance_count = ParameterInteger(name="InstanceCount", default_value=1) - instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") - good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1) - in_condition_input = ParameterString(name="Foo", default_value="Foo") - - pytorch_estimator = PyTorch( - entry_point=entry_point, - role=role, - framework_version="1.5.0", - py_version="py3", - instance_count=instance_count, - instance_type=instance_type, - sagemaker_session=sagemaker_session, - ) - step_train = TrainingStep( - name="pytorch-train", - estimator=pytorch_estimator, - inputs=inputs, - ) - - step_register = RegisterModel( - name="pytorch-register-model", - estimator=pytorch_estimator, - model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, - content_types=["*"], - response_types=["*"], - inference_instances=["*"], - transform_instances=["*"], - description="test-description", - ) - - model = Model( - image_uri=pytorch_estimator.training_image_uri(), - model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts, - sagemaker_session=sagemaker_session, - role=role, - ) - model_inputs = CreateModelInput( - instance_type="ml.m5.large", - accelerator_type="ml.eia1.medium", - ) - step_model = CreateModelStep( - name="pytorch-model", - model=model, - inputs=model_inputs, - ) - - step_cond = ConditionStep( - name="cond-good-enough", - conditions=[ - ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1), - ConditionIn(value=in_condition_input, in_values=["foo", "bar"]), - ], - if_steps=[step_train, step_register], - else_steps=[step_model], - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[ - in_condition_input, - good_enough_input, - instance_count, - instance_type, - ], - steps=[step_cond], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - - execution = pipeline.start(parameters={"GoodEnoughInput": 0}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_tuning_single_algo( - sagemaker_session, - role, - cpu_instance_type, - pipeline_name, - region_name, -): - base_dir = os.path.join(DATA_DIR, "pytorch_mnist") - entry_point = os.path.join(base_dir, "mnist.py") - input_path = sagemaker_session.upload_data( - path=os.path.join(base_dir, "training"), - 
key_prefix="integ-test-data/pytorch_mnist/training", - ) - inputs = TrainingInput(s3_data=input_path) - - instance_count = ParameterInteger(name="InstanceCount", default_value=1) - instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") - - pytorch_estimator = PyTorch( - entry_point=entry_point, - role=role, - framework_version="1.5.0", - py_version="py3", - instance_count=instance_count, - instance_type=instance_type, - sagemaker_session=sagemaker_session, - enable_sagemaker_metrics=True, - max_retry_attempts=3, - ) - - min_batch_size = ParameterString(name="MinBatchSize", default_value="64") - max_batch_size = ParameterString(name="MaxBatchSize", default_value="128") - hyperparameter_ranges = { - "batch-size": IntegerParameter(min_batch_size, max_batch_size), - } - - tuner = HyperparameterTuner( - estimator=pytorch_estimator, - objective_metric_name="test:acc", - objective_type="Maximize", - hyperparameter_ranges=hyperparameter_ranges, - metric_definitions=[{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}], - max_jobs=2, - max_parallel_jobs=2, - ) - - step_tune = TuningStep( - name="my-tuning-step", - tuner=tuner, - inputs=inputs, - ) - - best_model = Model( - image_uri=pytorch_estimator.training_image_uri(), - model_data=step_tune.get_top_model_s3_uri( - top_k=0, - s3_bucket=sagemaker_session.default_bucket(), - ), - sagemaker_session=sagemaker_session, - role=role, - ) - model_inputs = CreateModelInput( - instance_type="ml.m5.large", - accelerator_type="ml.eia1.medium", - ) - step_best_model = CreateModelStep( - name="1st-model", - model=best_model, - inputs=model_inputs, - ) - - second_best_model = Model( - image_uri=pytorch_estimator.training_image_uri(), - model_data=step_tune.get_top_model_s3_uri( - top_k=1, - s3_bucket=sagemaker_session.default_bucket(), - ), - sagemaker_session=sagemaker_session, - role=role, - ) - - step_second_best_model = CreateModelStep( - name="2nd-best-model", - model=second_best_model, - inputs=model_inputs, - ) - - pipeline = Pipeline( - name=pipeline_name, - parameters=[instance_count, instance_type, min_batch_size, max_batch_size], - steps=[step_tune, step_best_model, step_second_best_model], - sagemaker_session=sagemaker_session, - ) - - try: - response = pipeline.create(role) - create_arn = response["PipelineArn"] - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", - create_arn, - ) - - execution = pipeline.start(parameters={}) - assert re.match( - rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/", - execution.arn, - ) - finally: - try: - pipeline.delete() - except Exception: - pass - - -def test_tuning_multi_algos( - sagemaker_session, - role, - cpu_instance_type, - pipeline_name, - region_name, - script_dir, - athena_dataset_definition, -): - base_dir = os.path.join(DATA_DIR, "pytorch_mnist") - entry_point = os.path.join(base_dir, "mnist.py") - input_path = sagemaker_session.upload_data( - path=os.path.join(base_dir, "training"), - key_prefix="integ-test-data/pytorch_mnist/training", - ) - - instance_count = ParameterInteger(name="InstanceCount", default_value=1) - instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge") - - input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv" - - sklearn_processor = SKLearnProcessor( - framework_version="0.20.0", - instance_type=instance_type, - instance_count=instance_count, - base_job_name="test-sklearn", - 
-        sagemaker_session=sagemaker_session,
-        role=role,
-    )
-
-    property_file = PropertyFile(
-        name="DataAttributes", output_name="attributes", path="attributes.json"
-    )
-
-    step_process = ProcessingStep(
-        name="my-process",
-        display_name="ProcessingStep",
-        description="description for Processing step",
-        processor=sklearn_processor,
-        inputs=[
-            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
-            ProcessingInput(dataset_definition=athena_dataset_definition),
-        ],
-        outputs=[
-            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
-            ProcessingOutput(output_name="attributes", source="/opt/ml/processing/attributes.json"),
-        ],
-        property_files=[property_file],
-        code=os.path.join(script_dir, "preprocessing.py"),
-    )
-
-    static_hp_1 = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-    json_get_hp = JsonGet(
-        step_name=step_process.name, property_file=property_file, json_path="train_size"
-    )
-    pytorch_estimator = PyTorch(
-        entry_point=entry_point,
-        role=role,
-        framework_version="1.5.0",
-        py_version="py3",
-        instance_count=instance_count,
-        instance_type=instance_type,
-        sagemaker_session=sagemaker_session,
-        enable_sagemaker_metrics=True,
-        max_retry_attempts=3,
-        hyperparameters={"static-hp": static_hp_1, "train_size": json_get_hp},
-    )
-
-    min_batch_size = ParameterString(name="MinBatchSize", default_value="64")
-    max_batch_size = json_get_hp
-
-    tuner = HyperparameterTuner.create(
-        estimator_dict={
-            "estimator-1": pytorch_estimator,
-            "estimator-2": pytorch_estimator,
-        },
-        objective_metric_name_dict={
-            "estimator-1": "test:acc",
-            "estimator-2": "test:acc",
-        },
-        hyperparameter_ranges_dict={
-            "estimator-1": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)},
-            "estimator-2": {"batch-size": IntegerParameter(min_batch_size, max_batch_size)},
-        },
-        metric_definitions_dict={
-            "estimator-1": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
-            "estimator-2": [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}],
-        },
-    )
-
-    inputs = {
-        "estimator-1": TrainingInput(s3_data=input_path),
-        "estimator-2": TrainingInput(s3_data=input_path),
-    }
-
-    step_tune = TuningStep(
-        name="my-tuning-step",
-        tuner=tuner,
-        inputs=inputs,
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
-        steps=[step_process, step_tune],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-
-        execution = pipeline.start(parameters={})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_mxnet_model_registration(
-    sagemaker_session,
-    role,
-    cpu_instance_type,
-    pipeline_name,
-    region_name,
-):
-    base_dir = os.path.join(DATA_DIR, "mxnet_mnist")
-    source_dir = os.path.join(base_dir, "code")
-    entry_point = os.path.join(source_dir, "inference.py")
-    mx_mnist_model_data = os.path.join(base_dir, "model.tar.gz")
-
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-
-    model = MXNetModel(
-        entry_point=entry_point,
-        source_dir=source_dir,
-        role=role,
-        model_data=mx_mnist_model_data,
-        framework_version="1.7.0",
-        py_version="py3",
-        sagemaker_session=sagemaker_session,
-    )
-
-    step_register = RegisterModel(
-        name="mxnet-register-model",
-        model=model,
-        content_types=["*"],
-        response_types=["*"],
-        inference_instances=["ml.m5.xlarge"],
-        transform_instances=["*"],
-        description="test-description",
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count, instance_type],
-        steps=[step_register],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-
-        execution = pipeline.start(parameters={})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-
-        execution = pipeline.start()
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_sklearn_xgboost_sip_model_registration(
-    sagemaker_session, role, pipeline_name, region_name
-):
-    prefix = "sip"
-    bucket_name = sagemaker_session.default_bucket()
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-
-    sklearn_processor = SKLearnProcessor(
-        role=role,
-        instance_type=instance_type,
-        instance_count=instance_count,
-        framework_version="0.20.0",
-        sagemaker_session=sagemaker_session,
-    )
-
-    # The path to the raw data.
-    raw_data_path = "s3://{0}/{1}/data/raw/".format(bucket_name, prefix)
-    raw_data_path_param = ParameterString(name="raw_data_path", default_value=raw_data_path)
-
-    # The output path to the training data.
-    train_data_path = "s3://{0}/{1}/data/preprocessed/train/".format(bucket_name, prefix)
-    train_data_path_param = ParameterString(name="train_data_path", default_value=train_data_path)
-
-    # The output path to the validation data.
-    val_data_path = "s3://{0}/{1}/data/preprocessed/val/".format(bucket_name, prefix)
-    val_data_path_param = ParameterString(name="val_data_path", default_value=val_data_path)
-
-    # The training output path for the model.
-    output_path = "s3://{0}/{1}/output/".format(bucket_name, prefix)
-    output_path_param = ParameterString(name="output_path", default_value=output_path)
-
-    # The output path to the featurizer model.
-    model_path = "s3://{0}/{1}/output/sklearn/".format(bucket_name, prefix)
-    model_path_param = ParameterString(name="model_path", default_value=model_path)
-
-    inputs = [
-        ProcessingInput(
-            input_name="raw_data",
-            source=raw_data_path_param,
-            destination="/opt/ml/processing/input",
-        )
-    ]
-
-    outputs = [
-        ProcessingOutput(
-            output_name="train_data",
-            source="/opt/ml/processing/train",
-            destination=train_data_path_param,
-        ),
-        ProcessingOutput(
-            output_name="val_data",
-            source="/opt/ml/processing/val",
-            destination=val_data_path_param,
-        ),
-        ProcessingOutput(
-            output_name="model",
-            source="/opt/ml/processing/model",
-            destination=model_path_param,
-        ),
-    ]
-
-    base_dir = os.path.join(DATA_DIR, "sip")
-    code_path = os.path.join(base_dir, "preprocessor.py")
-
-    processing_step = ProcessingStep(
-        name="Processing",
-        code=code_path,
-        processor=sklearn_processor,
-        inputs=inputs,
-        outputs=outputs,
-        job_arguments=["--train-test-split-ratio", "0.2"],
-    )
-
-    entry_point = "training.py"
-    source_dir = base_dir
-    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
-
-    estimator = XGBoost(
-        entry_point=entry_point,
-        source_dir=source_dir,
-        output_path=output_path_param,
-        code_location=code_location,
-        instance_type=instance_type,
-        instance_count=instance_count,
-        framework_version="0.90-2",
-        sagemaker_session=sagemaker_session,
-        py_version="py3",
-        role=role,
-    )
-
-    training_step = TrainingStep(
-        name="Training",
-        estimator=estimator,
-        inputs={
-            "train": TrainingInput(
-                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
-                    "train_data"
-                ].S3Output.S3Uri,
-                content_type="text/csv",
-            ),
-            "validation": TrainingInput(
-                s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
-                    "val_data"
-                ].S3Output.S3Uri,
-                content_type="text/csv",
-            ),
-        },
-    )
-
-    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
-    source_dir = os.path.join(base_dir, "sklearn_source_dir")
-
-    sklearn_model = SKLearnModel(
-        name="sklearn-model",
-        model_data=processing_step.properties.ProcessingOutputConfig.Outputs[
-            "model"
-        ].S3Output.S3Uri,
-        entry_point="inference.py",
-        source_dir=source_dir,
-        code_location=code_location,
-        role=role,
-        sagemaker_session=sagemaker_session,
-        framework_version="0.20.0",
-        py_version="py3",
-    )
-
-    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)
-    source_dir = os.path.join(base_dir, "xgboost_source_dir")
-
-    xgboost_model = XGBoostModel(
-        name="xgboost-model",
-        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
-        entry_point="inference.py",
-        source_dir=source_dir,
-        code_location=code_location,
-        framework_version="0.90-2",
-        py_version="py3",
-        role=role,
-        sagemaker_session=sagemaker_session,
-    )
-
-    pipeline_model = PipelineModel(
-        [xgboost_model, sklearn_model], role, sagemaker_session=sagemaker_session
-    )
-
-    step_register = RegisterModel(
-        name="AbaloneRegisterModel",
-        model=pipeline_model,
-        content_types=["application/json"],
-        response_types=["application/json"],
-        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
-        transform_instances=["ml.m5.xlarge"],
-        model_package_group_name="windturbine",
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[
-            raw_data_path_param,
-            train_data_path_param,
-            val_data_path_param,
-            model_path_param,
-            instance_type,
-            instance_count,
-            output_path_param,
-        ],
-        steps=[processing_step, training_step, step_register],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.upsert(role_arn=role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-
-        execution = pipeline.start(parameters={})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-
-        execution = pipeline.start()
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-@pytest.mark.skipif(
-    tests.integ.test_region() not in tests.integ.DRIFT_CHECK_BASELINES_SUPPORTED_REGIONS,
-    reason=(
-        "DriftCheckBaselines changes are not fully deployed in" f" {tests.integ.test_region()}."
-    ),
-)
-def test_model_registration_with_drift_check_baselines(
-    sagemaker_session,
-    role,
-    pipeline_name,
-):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-
-    # upload model data to s3
-    model_local_path = os.path.join(DATA_DIR, "mxnet_mnist/model.tar.gz")
-    model_base_uri = "s3://{}/{}/input/model/{}".format(
-        sagemaker_session.default_bucket(),
-        "register_model_test_with_drift_baseline",
-        utils.unique_name_from_base("model"),
-    )
-    model_uri = S3Uploader.upload(
-        model_local_path, model_base_uri, sagemaker_session=sagemaker_session
-    )
-    model_uri_param = ParameterString(name="model_uri", default_value=model_uri)
-
-    # upload metrics to s3
-    metrics_data = (
-        '{"regression_metrics": {"mse": {"value": 4.925353410353891, '
-        '"standard_deviation": 2.219186917819692}}}'
-    )
-    metrics_base_uri = "s3://{}/{}/input/metrics/{}".format(
-        sagemaker_session.default_bucket(),
-        "register_model_test_with_drift_baseline",
-        utils.unique_name_from_base("metrics"),
-    )
-    metrics_uri = S3Uploader.upload_string_as_file_body(
-        body=metrics_data,
-        desired_s3_uri=metrics_base_uri,
-        sagemaker_session=sagemaker_session,
-    )
-    metrics_uri_param = ParameterString(name="metrics_uri", default_value=metrics_uri)
-
-    model_metrics = ModelMetrics(
-        bias=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        explainability=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        bias_pre_training=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        bias_post_training=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-    )
-    drift_check_baselines = DriftCheckBaselines(
-        model_statistics=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        model_constraints=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        model_data_statistics=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        model_data_constraints=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        bias_config_file=FileSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        bias_pre_training_constraints=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        bias_post_training_constraints=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        explainability_constraints=MetricsSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-        explainability_config_file=FileSource(
-            s3_uri=metrics_uri_param,
-            content_type="application/json",
-        ),
-    )
-    customer_metadata_properties = {"key1": "value1"}
-    estimator = XGBoost(
-        entry_point="training.py",
-        source_dir=os.path.join(DATA_DIR, "sip"),
-        instance_type=instance_type,
-        instance_count=instance_count,
-        framework_version="0.90-2",
-        sagemaker_session=sagemaker_session,
-        py_version="py3",
-        role=role,
-    )
-    step_register = RegisterModel(
-        name="MyRegisterModelStep",
-        estimator=estimator,
-        model_data=model_uri_param,
-        content_types=["application/json"],
-        response_types=["application/json"],
-        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
-        transform_instances=["ml.m5.xlarge"],
-        model_package_group_name="testModelPackageGroup",
-        model_metrics=model_metrics,
-        drift_check_baselines=drift_check_baselines,
-        customer_metadata_properties=customer_metadata_properties,
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[
-            model_uri_param,
-            metrics_uri_param,
-            instance_type,
-            instance_count,
-        ],
-        steps=[step_register],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-
-        for _ in retries(
-            max_retry_count=5,
-            exception_message_prefix="Waiting for a successful execution of pipeline",
-            seconds_to_sleep=10,
-        ):
-            execution = pipeline.start(
-                parameters={"model_uri": model_uri, "metrics_uri": metrics_uri}
-            )
-            response = execution.describe()
-
-            assert response["PipelineArn"] == create_arn
-
-            try:
-                execution.wait(delay=30, max_attempts=60)
-            except WaiterError:
-                pass
-            execution_steps = execution.list_steps()
-
-            assert len(execution_steps) == 1
-            failure_reason = execution_steps[0].get("FailureReason", "")
-            if failure_reason != "":
-                logging.error(
-                    f"Pipeline execution failed with error: {failure_reason}." " Retrying.."
-                )
-                continue
-            assert execution_steps[0]["StepStatus"] == "Succeeded"
-            assert execution_steps[0]["StepName"] == "MyRegisterModelStep"
-
-            response = sagemaker_session.sagemaker_client.describe_model_package(
-                ModelPackageName=execution_steps[0]["Metadata"]["RegisterModel"]["Arn"]
-            )
-
-            assert (
-                response["ModelMetrics"]["Explainability"]["Report"]["ContentType"]
-                == "application/json"
-            )
-            assert (
-                response["DriftCheckBaselines"]["Bias"]["PreTrainingConstraints"]["ContentType"]
-                == "application/json"
-            )
-            assert (
-                response["DriftCheckBaselines"]["Explainability"]["Constraints"]["ContentType"]
-                == "application/json"
-            )
-            assert (
-                response["DriftCheckBaselines"]["ModelQuality"]["Statistics"]["ContentType"]
-                == "application/json"
-            )
-            assert (
-                response["DriftCheckBaselines"]["ModelDataQuality"]["Statistics"]["ContentType"]
-                == "application/json"
-            )
-            assert response["CustomerMetadataProperties"] == customer_metadata_properties
-            break
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_model_registration_with_model_repack(
-    sagemaker_session,
-    role,
-    pipeline_name,
-    region_name,
-):
-    kms_key = get_or_create_kms_key(sagemaker_session, role)
-    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
-    entry_point = os.path.join(base_dir, "mnist.py")
-    input_path = sagemaker_session.upload_data(
-        path=os.path.join(base_dir, "training"),
-        key_prefix="integ-test-data/pytorch_mnist/training",
-    )
-    inputs = TrainingInput(s3_data=input_path)
-
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)
-
-    pytorch_estimator = PyTorch(
-        entry_point=entry_point,
-        role=role,
-        framework_version="1.5.0",
-        py_version="py3",
-        instance_count=instance_count,
-        instance_type=instance_type,
-        sagemaker_session=sagemaker_session,
-        output_kms_key=kms_key,
-    )
-    step_train = TrainingStep(
-        name="pytorch-train",
-        estimator=pytorch_estimator,
-        inputs=inputs,
-    )
-
-    step_register = RegisterModel(
-        name="pytorch-register-model",
-        estimator=pytorch_estimator,
-        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
-        content_types=["text/csv"],
-        response_types=["text/csv"],
-        inference_instances=["ml.t2.medium", "ml.m5.large"],
-        transform_instances=["ml.m5.large"],
-        description="test-description",
-        entry_point=entry_point,
-        model_kms_key=kms_key,
-    )
-
-    model = Model(
-        image_uri=pytorch_estimator.training_image_uri(),
-        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
-        sagemaker_session=sagemaker_session,
-        role=role,
-    )
-    model_inputs = CreateModelInput(
-        instance_type="ml.m5.large",
-        accelerator_type="ml.eia1.medium",
-    )
-    step_model = CreateModelStep(
-        name="pytorch-model",
-        model=model,
-        inputs=model_inputs,
-    )
-
-    step_cond = ConditionStep(
-        name="cond-good-enough",
-        conditions=[ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1)],
-        if_steps=[step_train, step_register],
-        else_steps=[step_model],
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[good_enough_input, instance_count, instance_type],
-        steps=[step_cond],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-
-        execution = pipeline.start(parameters={})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-
-        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_training_job_with_debugger_and_profiler(
-    sagemaker_session,
-    pipeline_name,
-    role,
-    pytorch_training_latest_version,
-    pytorch_training_latest_py_version,
-):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
-
-    rules = [
-        Rule.sagemaker(rule_configs.vanishing_gradient()),
-        Rule.sagemaker(base_config=rule_configs.all_zero(), rule_parameters={"tensor_regex": ".*"}),
-        Rule.sagemaker(rule_configs.loss_not_decreasing()),
-    ]
-    debugger_hook_config = DebuggerHookConfig(
-        s3_output_path=(f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors")
-    )
-
-    base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
-    script_path = os.path.join(base_dir, "mnist.py")
-    input_path = sagemaker_session.upload_data(
-        path=os.path.join(base_dir, "training"),
-        key_prefix="integ-test-data/pytorch_mnist/training",
-    )
-    inputs = TrainingInput(s3_data=input_path)
-
-    pytorch_estimator = PyTorch(
-        entry_point=script_path,
-        role="SageMakerRole",
-        framework_version=pytorch_training_latest_version,
-        py_version=pytorch_training_latest_py_version,
-        instance_count=instance_count,
-        instance_type=instance_type,
-        sagemaker_session=sagemaker_session,
-        rules=rules,
-        debugger_hook_config=debugger_hook_config,
-    )
-
-    step_train = TrainingStep(
-        name="pytorch-train",
-        estimator=pytorch_estimator,
-        inputs=inputs,
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count, instance_type],
-        steps=[step_train],
-        sagemaker_session=sagemaker_session,
-    )
-
-    for _ in retries(
-        max_retry_count=5,
-        exception_message_prefix="Waiting for a successful execution of pipeline",
-        seconds_to_sleep=10,
-    ):
-        try:
-            response = pipeline.create(role)
-            create_arn = response["PipelineArn"]
-
-            execution = pipeline.start()
-            response = execution.describe()
-            assert response["PipelineArn"] == create_arn
-
-            try:
-                execution.wait(delay=10, max_attempts=60)
-            except WaiterError:
-                pass
-            execution_steps = execution.list_steps()
-
-            assert len(execution_steps) == 1
-            failure_reason = execution_steps[0].get("FailureReason", "")
-            if failure_reason != "":
-                logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..")
-                continue
-            assert execution_steps[0]["StepName"] == "pytorch-train"
-            assert execution_steps[0]["StepStatus"] == "Succeeded"
-
-            training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
-            job_description = sagemaker_session.sagemaker_client.describe_training_job(
-                TrainingJobName=training_job_arn.split("/")[1]
-            )
-
-            for index, rule in enumerate(rules):
-                config = job_description["DebugRuleConfigurations"][index]
-                assert config["RuleConfigurationName"] == rule.name
-                assert config["RuleEvaluatorImage"] == rule.image_uri
-                assert config["VolumeSizeInGB"] == 0
-                assert (
-                    config["RuleParameters"]["rule_to_invoke"]
-                    == rule.rule_parameters["rule_to_invoke"]
-                )
-            assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
-
-            assert job_description["ProfilingStatus"] == "Enabled"
-            assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
-            break
-        finally:
-            try:
-                pipeline.delete()
-            except Exception:
-                pass
-
-
-def test_two_processing_job_depends_on(
-    sagemaker_session,
-    role,
-    pipeline_name,
-    region_name,
-    cpu_instance_type,
-):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
-    script_path = os.path.join(DATA_DIR, "dummy_script.py")
-
-    pyspark_processor = PySparkProcessor(
-        base_job_name="sm-spark",
-        framework_version="2.4",
-        role=role,
-        instance_count=instance_count,
-        instance_type=cpu_instance_type,
-        max_runtime_in_seconds=1200,
-        sagemaker_session=sagemaker_session,
-    )
-
-    spark_run_args = pyspark_processor.get_run_args(
-        submit_app=script_path,
-        arguments=[
-            "--s3_input_bucket",
-            sagemaker_session.default_bucket(),
-            "--s3_input_key_prefix",
-            "spark-input",
-            "--s3_output_bucket",
-            sagemaker_session.default_bucket(),
-            "--s3_output_key_prefix",
-            "spark-output",
-        ],
-    )
-
-    step_pyspark_1 = ProcessingStep(
-        name="pyspark-process-1",
-        processor=pyspark_processor,
-        inputs=spark_run_args.inputs,
-        outputs=spark_run_args.outputs,
-        job_arguments=spark_run_args.arguments,
-        code=spark_run_args.code,
-    )
-
-    step_pyspark_2 = ProcessingStep(
-        name="pyspark-process-2",
-        depends_on=[step_pyspark_1],
-        processor=pyspark_processor,
-        inputs=spark_run_args.inputs,
-        outputs=spark_run_args.outputs,
-        job_arguments=spark_run_args.arguments,
-        code=spark_run_args.code,
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count],
-        steps=[step_pyspark_1, step_pyspark_2],
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-
-        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
-        response = pipeline.update(role)
-        update_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            update_arn,
-        )
-
-        execution = pipeline.start(parameters={})
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
-            execution.arn,
-        )
-
-        response = execution.describe()
-        assert response["PipelineArn"] == create_arn
-
-        try:
-            execution.wait(delay=60)
-        except WaiterError:
-            pass
-
-        execution_steps = execution.list_steps()
-        assert len(execution_steps) == 2
-        time_stamp = {}
-        for execution_step in execution_steps:
-            name = execution_step["StepName"]
-            if name == "pyspark-process-1":
-                time_stamp[name] = execution_step["EndTime"]
-            else:
-                time_stamp[name] = execution_step["StartTime"]
-        assert time_stamp["pyspark-process-1"] < time_stamp["pyspark-process-2"]
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_one_step_data_wrangler_processing_pipeline(sagemaker_session, role, pipeline_name):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")
-
-    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
-    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")
-
-    output_name = "3f74973c-fd1e-4845-89f8-0dd400031be9.default"
-    output_content_type = "CSV"
-    output_config = {output_name: {"content_type": output_content_type}}
-    job_argument = [f"--output-config '{json.dumps(output_config)}'"]
-
-    inputs = [
-        ProcessingInput(
-            input_name="dummy_data.csv",
-            source=input_file_path,
-            destination="/opt/ml/processing/dummy_data.csv",
-        )
-    ]
-
-    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
-    outputs = [
-        ProcessingOutput(
-            output_name=output_name,
-            source="/opt/ml/processing/output",
-            destination=output_s3_uri,
-            s3_upload_mode="EndOfJob",
-        )
-    ]
-
-    data_wrangler_processor = DataWranglerProcessor(
-        role=role,
-        data_wrangler_flow_source=recipe_file_path,
-        instance_count=instance_count,
-        instance_type=instance_type,
-        sagemaker_session=sagemaker_session,
-        max_runtime_in_seconds=86400,
-    )
-
-    data_wrangler_step = ProcessingStep(
-        name="data-wrangler-step",
-        processor=data_wrangler_processor,
-        inputs=inputs,
-        outputs=outputs,
-        job_arguments=job_argument,
-    )
-
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count, instance_type],
-        steps=[data_wrangler_step],
-        sagemaker_session=sagemaker_session,
-    )
-
-    definition = json.loads(pipeline.definition())
-    expected_image_uri = image_uris.retrieve(
-        "data-wrangler", region=sagemaker_session.boto_region_name
-    )
-    assert len(definition["Steps"]) == 1
-    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] is not None
-    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] == expected_image_uri
-
-    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
-    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
-    assert len(processing_inputs) == 2
-    for processing_input in processing_inputs:
-        if processing_input["InputName"] == "flow":
-            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
-            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/flow"
-        elif processing_input["InputName"] == "dummy_data.csv":
-            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
-            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/dummy_data.csv"
-        else:
-            raise AssertionError("Unknown input name")
-    assert definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"] is not None
-    processing_outputs = definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"]["Outputs"]
-    assert len(processing_outputs) == 1
-    assert processing_outputs[0]["OutputName"] == output_name
-    assert processing_outputs[0]["S3Output"] is not None
-    assert processing_outputs[0]["S3Output"]["LocalPath"] == "/opt/ml/processing/output"
-    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-
-        execution = pipeline.start()
-        response = execution.describe()
-        assert response["PipelineArn"] == create_arn
-
-        try:
-            execution.wait(delay=60, max_attempts=10)
-        except WaiterError:
-            pass
-
-        execution_steps = execution.list_steps()
-        assert len(execution_steps) == 1
-        assert execution_steps[0]["StepName"] == "data-wrangler-step"
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_one_step_ingestion_pipeline(
-    sagemaker_session, feature_store_session, feature_definitions, role, pipeline_name
-):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
-    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.4xlarge")
-
-    input_name = "features.csv"
-    input_file_path = os.path.join(DATA_DIR, "workflow", "features.csv")
-    input_data_uri = os.path.join(
-        "s3://",
-        sagemaker_session.default_bucket(),
-        "py-sdk-ingestion-test-input/features.csv",
-    )
-
-    with open(input_file_path, "r") as data:
-        body = data.read()
-        S3Uploader.upload_string_as_file_body(
-            body=body,
-            desired_s3_uri=input_data_uri,
-            sagemaker_session=sagemaker_session,
-        )
-
-    inputs = [
-        ProcessingInput(
-            input_name=input_name,
-            source=input_data_uri,
-            destination="/opt/ml/processing/features.csv",
-        )
-    ]
-
-    feature_group_name = f"py-sdk-integ-fg-{int(time.time() * 10**7)}"
-    feature_group = FeatureGroup(
-        name=feature_group_name,
-        feature_definitions=feature_definitions,
-        sagemaker_session=feature_store_session,
-    )
-
-    ingestion_only_flow, output_name = generate_data_ingestion_flow_from_s3_input(
-        input_name,
-        input_data_uri,
-        s3_content_type="csv",
-        s3_has_header=True,
-    )
-
-    outputs = [
-        ProcessingOutput(
-            output_name=output_name,
-            app_managed=True,
-            feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),
-        )
-    ]
-
-    output_content_type = "CSV"
-    output_config = {output_name: {"content_type": output_content_type}}
-    job_argument = [f"--output-config '{json.dumps(output_config)}'"]
-
-    temp_flow_path = "./ingestion.flow"
-    with cleanup_feature_group(feature_group):
-        json.dump(ingestion_only_flow, open(temp_flow_path, "w"))
-
-        data_wrangler_processor = DataWranglerProcessor(
-            role=role,
-            data_wrangler_flow_source=temp_flow_path,
-            instance_count=instance_count,
-            instance_type=instance_type,
-            sagemaker_session=sagemaker_session,
-            max_runtime_in_seconds=86400,
-        )
-
-        data_wrangler_step = ProcessingStep(
-            name="ingestion-step",
-            processor=data_wrangler_processor,
-            inputs=inputs,
-            outputs=outputs,
-            job_arguments=job_argument,
-        )
-
-        pipeline = Pipeline(
-            name=pipeline_name,
-            parameters=[instance_count, instance_type],
-            steps=[data_wrangler_step],
-            sagemaker_session=sagemaker_session,
-        )
-
-        try:
-            response = pipeline.create(role)
-            create_arn = response["PipelineArn"]
-
-            offline_store_s3_uri = os.path.join(
-                "s3://", sagemaker_session.default_bucket(), feature_group_name
-            )
-            feature_group.create(
-                s3_uri=offline_store_s3_uri,
-                record_identifier_name="f11",
-                event_time_feature_name="f10",
-                role_arn=role,
-                enable_online_store=False,
-            )
-            _wait_for_feature_group_create(feature_group)
-
-            execution = pipeline.start()
-            response = execution.describe()
-            assert response["PipelineArn"] == create_arn
-
-            try:
-                execution.wait(delay=60, max_attempts=10)
-            except WaiterError:
-                pass
-
-            execution_steps = execution.list_steps()
-
-            assert len(execution_steps) == 1
-            assert execution_steps[0]["StepName"] == "ingestion-step"
-            assert execution_steps[0]["StepStatus"] == "Succeeded"
-
-            athena_query = feature_group.athena_query()
-            with timeout(minutes=10):
-                athena_query.run(
-                    query_string=f'SELECT * FROM "{athena_query.table_name}"',
-                    output_location=f"{offline_store_s3_uri}/query_results",
-                )
-                athena_query.wait()
-                assert "SUCCEEDED" == athena_query.get_query_execution().get("QueryExecution").get(
-                    "Status"
-                ).get("State")
-
-                df = athena_query.as_dataframe()
-                assert pd.read_csv(input_file_path).shape[0] == df.shape[0]
-        finally:
-            try:
-                pipeline.delete()
-            except Exception as e:
-                print(f"Delete pipeline failed with error: {e}")
-            os.remove(temp_flow_path)
-
-
-@pytest.mark.skip(
-    reason="""This test creates a long-running pipeline that
-    runs actual training jobs, processing jobs, etc.
-    All of the functionality in this test is covered in
-    shallow tests in this suite; as such, this is disabled
-    and only run as part of the 'lineage' test suite."""
-)
-def test_end_to_end_pipeline_successful_execution(
-    sagemaker_session, region_name, role, pipeline_name, wait=False
-):
-    model_package_group_name = f"{pipeline_name}ModelPackageGroup"
-    data_path = os.path.join(DATA_DIR, "workflow")
-    default_bucket = sagemaker_session.default_bucket()
-
-    # download the input data
-    local_input_path = os.path.join(data_path, "abalone-dataset.csv")
-    s3 = sagemaker_session.boto_session.resource("s3")
-    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
-        "dataset/abalone-dataset.csv", local_input_path
-    )
-
-    # # upload the input data to our bucket
-    base_uri = f"s3://{default_bucket}/{pipeline_name}"
-    with open(local_input_path) as data:
-        body = data.read()
-        input_data_uri = S3Uploader.upload_string_as_file_body(
-            body=body,
-            desired_s3_uri=f"{base_uri}/abalone-dataset.csv",
-            sagemaker_session=sagemaker_session,
-        )
-
-    # download batch transform data
-    local_batch_path = os.path.join(data_path, "abalone-dataset-batch")
-    s3.Bucket(f"sagemaker-servicecatalog-seedcode-{region_name}").download_file(
-        "dataset/abalone-dataset-batch", local_batch_path
-    )
-
-    # upload the batch transform data
-    with open(local_batch_path) as data:
-        body = data.read()
-        batch_data_uri = S3Uploader.upload_string_as_file_body(
-            body=body,
-            desired_s3_uri=f"{base_uri}/abalone-dataset-batch",
-            sagemaker_session=sagemaker_session,
-        )
-
-    # define parameters
-    processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
-    processing_instance_type = ParameterString(
-        name="ProcessingInstanceType", default_value="ml.m5.xlarge"
-    )
-    training_instance_type = ParameterString(
-        name="TrainingInstanceType", default_value="ml.m5.xlarge"
-    )
-    model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="Approved")
-    input_data = ParameterString(
-        name="InputData",
-        default_value=input_data_uri,
-    )
-    batch_data = ParameterString(
-        name="BatchData",
-        default_value=batch_data_uri,
-    )
-
-    # define processing step
-    framework_version = "0.23-1"
-    sklearn_processor = SKLearnProcessor(
-        framework_version=framework_version,
-        instance_type=processing_instance_type,
-        instance_count=processing_instance_count,
-        base_job_name=f"{pipeline_name}-process",
-        role=role,
-        sagemaker_session=sagemaker_session,
-    )
-    step_process = ProcessingStep(
-        name="AbaloneProcess",
-        processor=sklearn_processor,
-        inputs=[
-            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
-        ],
-        outputs=[
-            ProcessingOutput(output_name="train", source="/opt/ml/processing/train"),
-            ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"),
-            ProcessingOutput(output_name="test", source="/opt/ml/processing/test"),
-        ],
-        code=os.path.join(data_path, "abalone/preprocessing.py"),
-    )
-
-    # define training step
-    model_path = f"s3://{default_bucket}/{pipeline_name}Train"
-    image_uri = image_uris.retrieve(
-        framework="xgboost",
-        region=region_name,
-        version="1.0-1",
-        py_version="py3",
-        instance_type=training_instance_type,
-    )
-    xgb_train = Estimator(
-        image_uri=image_uri,
-        instance_type=training_instance_type,
-        instance_count=1,
-        output_path=model_path,
-        role=role,
-        sagemaker_session=sagemaker_session,
-    )
-    xgb_train.set_hyperparameters(
-        objective="reg:linear",
-        num_round=50,
-        max_depth=5,
-        eta=0.2,
-        gamma=4,
-        min_child_weight=6,
-        subsample=0.7,
-        silent=0,
-    )
-    step_train = TrainingStep(
-        name="AbaloneTrain",
-        estimator=xgb_train,
-        inputs={
-            "train": TrainingInput(
-                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
-                    "train"
-                ].S3Output.S3Uri,
-                content_type="text/csv",
-            ),
-            "validation": TrainingInput(
-                s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
-                    "validation"
-                ].S3Output.S3Uri,
-                content_type="text/csv",
-            ),
-        },
-    )
-
-    # define evaluation step
-    script_eval = ScriptProcessor(
-        image_uri=image_uri,
-        command=["python3"],
-        instance_type=processing_instance_type,
-        instance_count=1,
-        base_job_name=f"{pipeline_name}-eval",
-        role=role,
-        sagemaker_session=sagemaker_session,
-    )
-    evaluation_report = PropertyFile(
-        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
-    )
-    step_eval = ProcessingStep(
-        name="AbaloneEval",
-        processor=script_eval,
-        inputs=[
-            ProcessingInput(
-                source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
-                destination="/opt/ml/processing/model",
-            ),
-            ProcessingInput(
-                source=step_process.properties.ProcessingOutputConfig.Outputs[
-                    "test"
-                ].S3Output.S3Uri,
-                destination="/opt/ml/processing/test",
-            ),
-        ],
-        outputs=[
-            ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation"),
-        ],
-        code=os.path.join(data_path, "abalone/evaluation.py"),
-        property_files=[evaluation_report],
-    )
-
-    # define create model step
-    model = Model(
-        image_uri=image_uri,
-        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
-        sagemaker_session=sagemaker_session,
-        role=role,
-    )
-    inputs = CreateModelInput(
-        instance_type="ml.m5.large",
-        accelerator_type="ml.eia1.medium",
-    )
-    step_create_model = CreateModelStep(
-        name="AbaloneCreateModel",
-        model=model,
-        inputs=inputs,
-    )
-
-    # define transform step
-    transformer = Transformer(
-        model_name=step_create_model.properties.ModelName,
-        instance_type="ml.m5.xlarge",
-        instance_count=1,
-        output_path=f"s3://{default_bucket}/{pipeline_name}Transform",
-        sagemaker_session=sagemaker_session,
-    )
-    step_transform = TransformStep(
-        name="AbaloneTransform",
-        transformer=transformer,
-        inputs=TransformInput(data=batch_data),
-    )
-
-    # define register model step
-    model_metrics = ModelMetrics(
-        model_statistics=MetricsSource(
-            s3_uri="{}/evaluation.json".format(
-                step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"]
-            ),
-            content_type="application/json",
-        )
-    )
-    step_register = RegisterModel(
-        name="AbaloneRegisterModel",
-        estimator=xgb_train,
-        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
-        content_types=["text/csv"],
-        response_types=["text/csv"],
-        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
-        transform_instances=["ml.m5.xlarge"],
-        model_package_group_name=model_package_group_name,
-        approval_status=model_approval_status,
-        model_metrics=model_metrics,
-    )
-
-    # define condition step
-    cond_lte = ConditionLessThanOrEqualTo(
-        left=JsonGet(
-            step_name=step_eval.name,
-            property_file=evaluation_report,
-            json_path="regression_metrics.mse.value",
-        ),
-        right=20.0,
-    )
-
-    step_cond = ConditionStep(
-        name="AbaloneMSECond",
-        conditions=[cond_lte],
-        if_steps=[step_register, step_create_model, step_transform],
-        else_steps=[],
-    )
-
-    # define pipeline
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[
-            processing_instance_type,
-            processing_instance_count,
-            training_instance_type,
-            model_approval_status,
-            input_data,
-            batch_data,
-        ],
-        steps=[step_process, step_train, step_eval, step_cond],
-        sagemaker_session=sagemaker_session,
-    )
-
-    pipeline.create(role)
-    execution = pipeline.start()
-    execution_arn = execution.arn
-
-    if wait:
-        execution.wait()
-
-    return execution_arn
-
-
-def _wait_for_feature_group_create(feature_group: FeatureGroup):
-    status = feature_group.describe().get("FeatureGroupStatus")
-    while status == "Creating":
-        print("Waiting for Feature Group Creation")
-        time.sleep(5)
-        status = feature_group.describe().get("FeatureGroupStatus")
-    if status != "Created":
-        print(feature_group.describe())
-        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
-    print(f"FeatureGroup {feature_group.name} successfully created.")
-
-
-@pytest.fixture
-def feature_definitions():
-    return [
-        FeatureDefinition(feature_name="f1", feature_type=FeatureTypeEnum.STRING),
-        FeatureDefinition(feature_name="f2", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f3", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f4", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f5", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f6", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f7", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f8", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f9", feature_type=FeatureTypeEnum.INTEGRAL),
-        FeatureDefinition(feature_name="f10", feature_type=FeatureTypeEnum.FRACTIONAL),
-        FeatureDefinition(feature_name="f11", feature_type=FeatureTypeEnum.STRING),
-    ]
-
-
-@contextmanager
-def cleanup_feature_group(feature_group: FeatureGroup):
-    try:
-        yield
-    finally:
-        try:
-            feature_group.delete()
-            print("FeatureGroup cleaned up")
-        except Exception as e:
-            print(f"Delete FeatureGroup failed with error: {e}.")
-            pass
-
-
-def test_large_pipeline(sagemaker_session, role, pipeline_name, region_name):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
-
-    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)
-
-    callback_steps = [
-        CallbackStep(
-            name=f"callback-step{count}",
-            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
-            inputs={"arg1": "foo"},
-            outputs=[outputParam],
-        )
-        for count in range(2000)
-    ]
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count],
-        steps=callback_steps,
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-        response = pipeline.describe()
-        assert len(json.loads(pipeline.describe()["PipelineDefinition"])["Steps"]) == 2000
-
-        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
-        response = pipeline.update(role)
-        update_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            update_arn,
-        )
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
-
-
-def test_create_and_update_with_parallelism_config(
-    sagemaker_session, role, pipeline_name, region_name
-):
-    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
-
-    outputParam = CallbackOutput(output_name="output", output_type=CallbackOutputTypeEnum.String)
-
-    callback_steps = [
-        CallbackStep(
-            name=f"callback-step{count}",
-            sqs_queue_url="https://sqs.us-east-2.amazonaws.com/123456789012/MyQueue",
-            inputs={"arg1": "foo"},
-            outputs=[outputParam],
-        )
-        for count in range(500)
-    ]
-    pipeline = Pipeline(
-        name=pipeline_name,
-        parameters=[instance_count],
-        steps=callback_steps,
-        sagemaker_session=sagemaker_session,
-    )
-
-    try:
-        response = pipeline.create(role, parallelism_config={"MaxParallelExecutionSteps": 50})
-        create_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            create_arn,
-        )
-        response = pipeline.describe()
-        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 50
-
-        pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
-        response = pipeline.update(role, parallelism_config={"MaxParallelExecutionSteps": 55})
-        update_arn = response["PipelineArn"]
-        assert re.match(
-            rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
-            update_arn,
-        )
-
-        response = pipeline.describe()
-        assert response["ParallelismConfiguration"]["MaxParallelExecutionSteps"] == 55
-
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
diff --git a/tests/scripts/run-notebook-test.sh b/tests/scripts/run-notebook-test.sh
index b8a6e9bbf4..49fcebdaa2 100755
--- a/tests/scripts/run-notebook-test.sh
+++ b/tests/scripts/run-notebook-test.sh
@@ -81,8 +81,15 @@ echo "$LIFECYCLE_CONFIG_CONTENT"
 
 set -euo pipefail
 
+# git doesn't work in codepipeline, use CODEBUILD_RESOLVED_SOURCE_VERSION to get commit id
+codebuild_initiator="${CODEBUILD_INITIATOR:-0}"
+if [ "${codebuild_initiator:0:12}" == "codepipeline" ]; then
+    COMMIT_ID="${CODEBUILD_RESOLVED_SOURCE_VERSION}"
+else
+    COMMIT_ID=$(git rev-parse --short HEAD)
+fi
+
 ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text)
-COMMIT_ID=$(git rev-parse --short HEAD)
 LIFECYCLE_CONFIG_NAME="install-python-sdk-$COMMIT_ID"
 
 python setup.py sdist
diff --git a/tests/unit/sagemaker/jumpstart/test_utils.py b/tests/unit/sagemaker/jumpstart/test_utils.py
index 04eddced08..08ecafb7ac 100644
--- a/tests/unit/sagemaker/jumpstart/test_utils.py
+++ b/tests/unit/sagemaker/jumpstart/test_utils.py
@@ -20,6 +20,7 @@
     ENV_VARIABLE_JUMPSTART_CONTENT_BUCKET_OVERRIDE,
     JUMPSTART_BUCKET_NAME_SET,
     JUMPSTART_REGION_NAME_SET,
+    JUMPSTART_RESOURCE_BASE_NAME,
     JumpStartScriptScope,
 )
 from sagemaker.jumpstart.enums import JumpStartTag
@@ -874,3 +875,23 @@ def make_deprecated_spec(*largs, **kwargs):
         "pytorch-eqa-bert-base-cased",
         "*",
     )
+
+
+def test_get_jumpstart_base_name_if_jumpstart_model():
+    uris = [random_jumpstart_s3_uri("random_key") for _ in range(random.randint(1, 10))]
+    assert JUMPSTART_RESOURCE_BASE_NAME == utils.get_jumpstart_base_name_if_jumpstart_model(*uris)
+
+    uris = ["s3://not-jumpstart-bucket/some-key" for _ in range(random.randint(0, 10))]
+    assert utils.get_jumpstart_base_name_if_jumpstart_model(*uris) is None
+
+    uris = ["s3://not-jumpstart-bucket/some-key" for _ in range(random.randint(1, 10))] + [
+        random_jumpstart_s3_uri("random_key")
+    ]
+    assert JUMPSTART_RESOURCE_BASE_NAME == utils.get_jumpstart_base_name_if_jumpstart_model(*uris)
+
+    uris = (
+        ["s3://not-jumpstart-bucket/some-key" for _ in range(random.randint(1, 10))]
+        + [random_jumpstart_s3_uri("random_key")]
+        + ["s3://not-jumpstart-bucket/some-key-2" for _ in range(random.randint(1, 10))]
+    )
+    assert JUMPSTART_RESOURCE_BASE_NAME == utils.get_jumpstart_base_name_if_jumpstart_model(*uris)
diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py
index 42effef480..8befff7c77 100644
--- a/tests/unit/sagemaker/model/test_model.py
+++ b/tests/unit/sagemaker/model/test_model.py
@@ -19,13 +19,15 @@
 import sagemaker
 from sagemaker.model import FrameworkModel, Model
 from sagemaker.huggingface.model import HuggingFaceModel
-from sagemaker.jumpstart.constants import JUMPSTART_BUCKET_NAME_SET
+from sagemaker.jumpstart.constants import JUMPSTART_BUCKET_NAME_SET, JUMPSTART_RESOURCE_BASE_NAME
 from sagemaker.jumpstart.enums import JumpStartTag
 from sagemaker.mxnet.model import MXNetModel
 from sagemaker.pytorch.model import PyTorchModel
 from sagemaker.sklearn.model import SKLearnModel
 from sagemaker.tensorflow.model import TensorFlowModel
 from sagemaker.xgboost.model import XGBoostModel
+from sagemaker.workflow.properties import Properties
+
 
 MODEL_DATA = "s3://bucket/model.tar.gz"
 MODEL_IMAGE = "mi"
@@ -42,7 +44,6 @@
 BRANCH = "test-branch-git-config"
 COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73"
 ENTRY_POINT_INFERENCE = "inference.py"
-
 SCRIPT_URI = "s3://codebucket/someprefix/sourcedir.tar.gz"
 IMAGE_URI = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:1.9.0-gpu-py38"
@@ -71,6 +72,23 @@ def sagemaker_session():
     return sms
 
 
+@patch("shutil.rmtree", MagicMock())
+@patch("tarfile.open", MagicMock())
+@patch("os.listdir", MagicMock(return_value=[ENTRY_POINT_INFERENCE]))
+def test_prepare_container_def_with_model_src_s3_returns_correct_url(sagemaker_session):
+    model = Model(
+        entry_point=ENTRY_POINT_INFERENCE,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        source_dir=SCRIPT_URI,
+        image_uri=MODEL_IMAGE,
+        model_data=Properties("Steps.MyStep"),
+    )
+    container_def = model.prepare_container_def(INSTANCE_TYPE, "ml.eia.medium")
+
+    assert container_def["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"] == SCRIPT_URI
+
+
 def test_prepare_container_def_with_model_data():
     model = Model(MODEL_IMAGE)
     container_def = model.prepare_container_def(INSTANCE_TYPE, "ml.eia.medium")
@@ -551,3 +569,93 @@ def test_all_framework_models_add_jumpstart_tags(
 
         sagemaker_session.create_model.reset_mock()
         sagemaker_session.endpoint_from_production_variants.reset_mock()
+
+
+@patch("sagemaker.utils.repack_model")
+def test_script_mode_model_uses_jumpstart_base_name(repack_model, sagemaker_session):
+
+    jumpstart_source_dir = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[0]}/source_dirs/source.tar.gz"
+    t = Model(
+        entry_point=ENTRY_POINT_INFERENCE,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        source_dir=jumpstart_source_dir,
+        image_uri=IMAGE_URI,
+        model_data=MODEL_DATA,
+    )
+    t.deploy(instance_type=INSTANCE_TYPE, initial_instance_count=INSTANCE_COUNT)
+
+    assert sagemaker_session.create_model.call_args_list[0][0][0].startswith(
+        JUMPSTART_RESOURCE_BASE_NAME
+    )
+
+    assert sagemaker_session.endpoint_from_production_variants.call_args_list[0][1][
+        "name"
+    ].startswith(JUMPSTART_RESOURCE_BASE_NAME)
+
+    sagemaker_session.create_model.reset_mock()
+    sagemaker_session.endpoint_from_production_variants.reset_mock()
+
+    non_jumpstart_source_dir = "s3://blah/blah/blah"
+    t = Model(
+        entry_point=ENTRY_POINT_INFERENCE,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        source_dir=non_jumpstart_source_dir,
+        image_uri=IMAGE_URI,
+        model_data=MODEL_DATA,
+    )
+    t.deploy(instance_type=INSTANCE_TYPE, initial_instance_count=INSTANCE_COUNT)
+
+    assert not sagemaker_session.create_model.call_args_list[0][0][0].startswith(
+        JUMPSTART_RESOURCE_BASE_NAME
+    )
+
+    assert not sagemaker_session.endpoint_from_production_variants.call_args_list[0][1][
+        "name"
+    ].startswith(JUMPSTART_RESOURCE_BASE_NAME)
+
+
+@patch("sagemaker.utils.repack_model")
+@patch("sagemaker.fw_utils.tar_and_upload_dir")
+def test_all_framework_models_add_jumpstart_base_name(
+    tar_and_upload_dir, repack_model, sagemaker_session
+):
+    framework_model_classes_to_kwargs = {
+        PyTorchModel: {"framework_version": "1.5.0", "py_version": "py3"},
+        TensorFlowModel: {
+            "framework_version": "2.3",
+        },
+        HuggingFaceModel: {
+            "pytorch_version": "1.7.1",
+            "py_version": "py36",
+            "transformers_version": "4.6.1",
+        },
+        MXNetModel: {"framework_version": "1.7.0", "py_version": "py3"},
+        SKLearnModel: {
+            "framework_version": "0.23-1",
+        },
+        XGBoostModel: {
+            "framework_version": "1.3-1",
+        },
+    }
+    jumpstart_model_dir = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[0]}/model_dirs/model.tar.gz"
+    for framework_model_class, kwargs in framework_model_classes_to_kwargs.items():
+        framework_model_class(
+            entry_point=ENTRY_POINT_INFERENCE,
+            role=ROLE,
+            sagemaker_session=sagemaker_session,
+            model_data=jumpstart_model_dir,
+            **kwargs,
+        ).deploy(instance_type="ml.m2.xlarge", initial_instance_count=INSTANCE_COUNT)
+
+        assert sagemaker_session.create_model.call_args_list[0][0][0].startswith(
+            JUMPSTART_RESOURCE_BASE_NAME
+        )
+
+        assert sagemaker_session.endpoint_from_production_variants.call_args_list[0][1][
+            "name"
+        ].startswith(JUMPSTART_RESOURCE_BASE_NAME)
+
+        sagemaker_session.create_model.reset_mock()
+        sagemaker_session.endpoint_from_production_variants.reset_mock()
diff --git a/tests/unit/sagemaker/test_serializers.py b/tests/unit/sagemaker/test_serializers.py
index d2e4b7ce46..6b70c600ca 100644
--- a/tests/unit/sagemaker/test_serializers.py
+++ b/tests/unit/sagemaker/test_serializers.py
@@ -28,6 +28,7 @@
     SparseMatrixSerializer,
     JSONLinesSerializer,
     LibSVMSerializer,
+    DataSerializer,
 )
 from tests.unit import DATA_DIR
@@ -331,3 +332,26 @@ def test_libsvm_serializer_file_like(libsvm_serializer):
     libsvm_file.seek(0)
     result = libsvm_serializer.serialize(libsvm_file)
     assert result == validation_data
+
+
+@pytest.fixture
+def data_serializer():
+    return DataSerializer()
+
+
+def test_data_serializer_raw(data_serializer):
+    input_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.jpg")
+    with open(input_image_file_path, "rb") as image:
+        input_image = image.read()
+    input_image_data = data_serializer.serialize(input_image)
+    validation_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.raw")
+    validation_image_data = open(validation_image_file_path, "rb").read()
+    assert input_image_data == validation_image_data
+
+
+def test_data_serializer_file_like(data_serializer):
+    input_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.jpg")
+    validation_image_file_path = os.path.join(DATA_DIR, "", "cuteCat.raw")
+    input_image_data = data_serializer.serialize(input_image_file_path)
+    validation_image_data = open(validation_image_file_path, "rb").read()
+    assert input_image_data == validation_image_data
diff --git a/tests/unit/sagemaker/workflow/test_step_collections.py b/tests/unit/sagemaker/workflow/test_step_collections.py
index d2f1f07059..ea810796f4 100644
--- a/tests/unit/sagemaker/workflow/test_step_collections.py
+++ b/tests/unit/sagemaker/workflow/test_step_collections.py
@@ -457,7 +457,8 @@ def test_register_model_with_model_repack_with_estimator(
     assert len(request_dict["DependsOn"]) == 1
     assert request_dict["DependsOn"][0] == "TestStep"
     arguments = request_dict["Arguments"]
-    repacker_job_name = arguments["HyperParameters"]["sagemaker_job_name"]
arguments["HyperParameters"]["sagemaker_job_name"] + assert BUCKET in arguments["HyperParameters"]["sagemaker_submit_directory"] + arguments["HyperParameters"].pop("sagemaker_submit_directory") assert ordered(arguments) == ordered( { "AlgorithmSpecification": { @@ -472,12 +473,8 @@ def test_register_model_with_model_repack_with_estimator( "inference_script": '"dummy_script.py"', "dependencies": f'"{dummy_requirements}"', "model_archive": '"model.tar.gz"', - "sagemaker_submit_directory": '"s3://{}/{}/source/sourcedir.tar.gz"'.format( - BUCKET, repacker_job_name.replace('"', "") - ), "sagemaker_program": '"_repack_model.py"', "sagemaker_container_log_level": "20", - "sagemaker_job_name": repacker_job_name, "sagemaker_region": f'"{REGION}"', "source_dir": "null", }, @@ -585,7 +582,8 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift assert len(request_dict["DependsOn"]) == 1 assert request_dict["DependsOn"][0] == "TestStep" arguments = request_dict["Arguments"] - repacker_job_name = arguments["HyperParameters"]["sagemaker_job_name"] + assert BUCKET in arguments["HyperParameters"]["sagemaker_submit_directory"] + arguments["HyperParameters"].pop("sagemaker_submit_directory") assert ordered(arguments) == ordered( { "AlgorithmSpecification": { @@ -599,12 +597,8 @@ def test_register_model_with_model_repack_with_model(model, model_metrics, drift "HyperParameters": { "inference_script": '"dummy_script.py"', "model_archive": '"model.tar.gz"', - "sagemaker_submit_directory": '"s3://{}/{}/source/sourcedir.tar.gz"'.format( - BUCKET, repacker_job_name.replace('"', "") - ), "sagemaker_program": '"_repack_model.py"', "sagemaker_container_log_level": "20", - "sagemaker_job_name": repacker_job_name, "sagemaker_region": f'"{REGION}"', "dependencies": "null", "source_dir": "null", @@ -717,7 +711,8 @@ def test_register_model_with_model_repack_with_pipeline_model( assert len(request_dict["DependsOn"]) == 1 assert request_dict["DependsOn"][0] == "TestStep" arguments = request_dict["Arguments"] - repacker_job_name = arguments["HyperParameters"]["sagemaker_job_name"] + assert BUCKET in arguments["HyperParameters"]["sagemaker_submit_directory"] + arguments["HyperParameters"].pop("sagemaker_submit_directory") assert ordered(arguments) == ordered( { "AlgorithmSpecification": { @@ -732,12 +727,8 @@ def test_register_model_with_model_repack_with_pipeline_model( "dependencies": "null", "inference_script": '"dummy_script.py"', "model_archive": '"model.tar.gz"', - "sagemaker_submit_directory": '"s3://{}/{}/source/sourcedir.tar.gz"'.format( - BUCKET, repacker_job_name.replace('"', "") - ), "sagemaker_program": '"_repack_model.py"', "sagemaker_container_log_level": "20", - "sagemaker_job_name": repacker_job_name, "sagemaker_region": f'"{REGION}"', "source_dir": "null", }, @@ -917,7 +908,6 @@ def test_estimator_transformer_with_model_repack_with_estimator(estimator): arguments = request_dict["Arguments"] # pop out the dynamic generated fields arguments["HyperParameters"].pop("sagemaker_submit_directory") - arguments["HyperParameters"].pop("sagemaker_job_name") assert arguments == { "AlgorithmSpecification": { "TrainingInputMode": "File", diff --git a/tests/unit/sagemaker/workflow/test_steps.py b/tests/unit/sagemaker/workflow/test_steps.py index 674c715617..fd3bd7d0b9 100644 --- a/tests/unit/sagemaker/workflow/test_steps.py +++ b/tests/unit/sagemaker/workflow/test_steps.py @@ -399,7 +399,6 @@ def test_training_step_tensorflow(sagemaker_session): name="MyTrainingStep", estimator=estimator, 
inputs=inputs, cache_config=cache_config ) step_request = step.to_request() - step_request["Arguments"]["HyperParameters"].pop("sagemaker_job_name", None) step_request["Arguments"]["HyperParameters"].pop("sagemaker_program", None) step_request["Arguments"].pop("ProfilerRuleConfigurations", None) assert step_request == { diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 656d773914..35e022816b 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -24,7 +24,7 @@ from botocore.exceptions import ClientError from mock import ANY, MagicMock, Mock, patch from sagemaker.huggingface.estimator import HuggingFace -from sagemaker.jumpstart.constants import JUMPSTART_BUCKET_NAME_SET +from sagemaker.jumpstart.constants import JUMPSTART_BUCKET_NAME_SET, JUMPSTART_RESOURCE_BASE_NAME from sagemaker.jumpstart.enums import JumpStartTag import sagemaker.local @@ -3851,3 +3851,142 @@ def test_all_framework_estimators_add_jumpstart_tags( ] sagemaker_session.train.reset_mock() + + +@patch("time.time", return_value=TIME) +@patch("sagemaker.estimator.tar_and_upload_dir") +@patch("sagemaker.model.Model._upload_code") +def test_script_mode_estimator_uses_jumpstart_base_name_with_js_models( + patched_upload_code, patched_tar_and_upload_dir, sagemaker_session +): + patched_tar_and_upload_dir.return_value = UploadedCode( + s3_prefix="s3://%s/%s" % ("bucket", "key"), script_name="script_name" + ) + sagemaker_session.boto_region_name = REGION + + instance_type = "ml.p2.xlarge" + instance_count = 1 + + training_data_uri = "s3://bucket/mydata" + + source_dir = "s3://dsfsdfsd/sdfsdfs/sdfsd" + + generic_estimator = Estimator( + entry_point=SCRIPT_PATH, + role=ROLE, + region=REGION, + sagemaker_session=sagemaker_session, + instance_count=instance_count, + instance_type=instance_type, + source_dir=source_dir, + image_uri=IMAGE_URI, + model_uri=MODEL_DATA, + ) + generic_estimator.fit(training_data_uri) + + assert not sagemaker_session.train.call_args_list[0][1]["job_name"].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + sagemaker_session.reset_mock() + sagemaker_session.sagemaker_client.describe_training_job.return_value = { + "ModelArtifacts": {"S3ModelArtifacts": "some-uri"} + } + + inference_jumpstart_source_dir = ( + f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[0]}/source_dirs/inference/source.tar.gz" + ) + + generic_estimator.deploy( + initial_instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + image_uri=IMAGE_URI, + source_dir=inference_jumpstart_source_dir, + entry_point="inference.py", + role=ROLE, + ) + + assert sagemaker_session.create_model.call_args_list[0][0][0].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + + assert sagemaker_session.endpoint_from_production_variants.call_args_list[0].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + + +@patch("time.time", return_value=TIME) +@patch("sagemaker.estimator.tar_and_upload_dir") +@patch("sagemaker.model.Model._upload_code") +@patch("sagemaker.utils.repack_model") +def test_all_framework_estimators_add_jumpstart_base_name( + patched_repack_model, patched_upload_code, patched_tar_and_upload_dir, sagemaker_session +): + + sagemaker_session.boto_region_name = REGION + sagemaker_session.sagemaker_client.describe_training_job.return_value = { + "ModelArtifacts": {"S3ModelArtifacts": "some-uri"} + } + + patched_tar_and_upload_dir.return_value = UploadedCode( + s3_prefix="s3://%s/%s" % ("bucket", "key"), script_name="script_name" + ) + + framework_estimator_classes_to_kwargs = { + PyTorch: { + "framework_version": 
"1.5.0", + "py_version": "py3", + "instance_type": "ml.p2.xlarge", + }, + TensorFlow: { + "framework_version": "2.3", + "py_version": "py37", + "instance_type": "ml.p2.xlarge", + }, + HuggingFace: { + "pytorch_version": "1.7.1", + "py_version": "py36", + "transformers_version": "4.6.1", + "instance_type": "ml.p2.xlarge", + }, + MXNet: {"framework_version": "1.7.0", "py_version": "py3", "instance_type": "ml.p2.xlarge"}, + SKLearn: {"framework_version": "0.23-1", "instance_type": "ml.m2.xlarge"}, + XGBoost: {"framework_version": "1.3-1", "instance_type": "ml.m2.xlarge"}, + } + jumpstart_model_uri = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[0]}/model_dirs/model.tar.gz" + jumpstart_model_uri_2 = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[1]}/model_dirs/model.tar.gz" + for framework_estimator_class, kwargs in framework_estimator_classes_to_kwargs.items(): + estimator = framework_estimator_class( + entry_point=ENTRY_POINT, + role=ROLE, + sagemaker_session=sagemaker_session, + model_uri=jumpstart_model_uri, + instance_count=INSTANCE_COUNT, + **kwargs, + ) + + estimator.fit() + + assert sagemaker_session.train.call_args_list[0][1]["job_name"].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + + estimator.deploy( + initial_instance_count=INSTANCE_COUNT, + instance_type=kwargs["instance_type"], + image_uri=IMAGE_URI, + source_dir=jumpstart_model_uri_2, + entry_point="inference.py", + role=ROLE, + ) + + assert sagemaker_session.create_model.call_args_list[0][0][0].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + + assert sagemaker_session.endpoint_from_production_variants.call_args_list[0].startswith( + JUMPSTART_RESOURCE_BASE_NAME + ) + + sagemaker_session.endpoint_from_production_variants.reset_mock() + sagemaker_session.create_model.reset_mock() + sagemaker_session.train.reset_mock()