Commit 44d29b9

Allow users to customize trial component display names for pipeline launched jobs
1 parent a5464a2 commit 44d29b9

11 files changed, +363 -27 lines changed

doc/amazon_sagemaker_model_building_pipeline.rst

Lines changed: 2 additions & 0 deletions
@@ -741,6 +741,8 @@ There are a number of properties for a pipeline execution that can only be resol
 - :class:`sagemaker.workflow.execution_variables.ExecutionVariables.PIPELINE_EXECUTION_ARN`: The execution ARN for an execution.
 - :class:`sagemaker.workflow.execution_variables.ExecutionVariables.PIPELINE_NAME`: The name of the pipeline.
 - :class:`sagemaker.workflow.execution_variables.ExecutionVariables.PIPELINE_ARN`: The ARN of the pipeline.
+- :class:`sagemaker.workflow.execution_variables.ExecutionVariables.TRAINING_JOB_NAME`: The name of the training job launched by the training step.
+- :class:`sagemaker.workflow.execution_variables.ExecutionVariables.PROCESSING_JOB_NAME`: The name of the processing job launched by the processing step.

 You can use these execution variables as you see fit. The following example uses the :code:`START_DATETIME` execution variable to construct a processing output path:
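
A rough illustration (not part of the diff; the bucket name is a placeholder) of how the two new variables can be used, in the same spirit as the START_DATETIME example referenced above:

from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join

# Hypothetical output prefix that embeds the execution ID and the name of the
# processing job launched by the current processing step.
output_path = Join(
    on="/",
    values=[
        "s3://my-bucket/runs",
        ExecutionVariables.PIPELINE_EXECUTION_ID,
        ExecutionVariables.PROCESSING_JOB_NAME,
    ],
)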

doc/workflows/pipelines/sagemaker.workflow.pipelines.rst

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ Execution Variables
 .. autoclass:: sagemaker.workflow.execution_variables.ExecutionVariable

 .. autoclass:: sagemaker.workflow.execution_variables.ExecutionVariables
-    :members: START_DATETIME, CURRENT_DATETIME, PIPELINE_EXECUTION_ID, PIPELINE_EXECUTION_ARN, PIPELINE_NAME, PIPELINE_ARN
+    :members: START_DATETIME, CURRENT_DATETIME, PIPELINE_EXECUTION_ID, PIPELINE_EXECUTION_ARN, PIPELINE_NAME, PIPELINE_ARN, TRAINING_JOB_NAME, PROCESSING_JOB_NAME

 Functions
 ---------

src/sagemaker/estimator.py

Lines changed: 6 additions & 0 deletions
@@ -1000,6 +1000,12 @@ def fit(
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                   will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+                * Both `ExperimentName` and `TrialName` will be ignored if the Estimator instance
+                  is built with :class:`~sagemaker.workflow.pipeline_context.PipelineSession`.
+                  However, the value of `TrialComponentDisplayName` is honored for display in Studio.
+        Returns:
+            None or pipeline step arguments in case the Estimator instance is built with
+            :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
         """
         self._prepare_for_training(job_name=job_name)
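
A minimal sketch of the behavior this docstring describes (not part of the commit; image URI, role, and S3 paths are placeholders): when the Estimator is built with a PipelineSession, ExperimentName and TrialName are dropped from the generated step arguments, and only TrialComponentDisplayName is carried through for display in Studio.

from sagemaker.estimator import Estimator
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import TrainingStep

pipeline_session = PipelineSession()

estimator = Estimator(
    image_uri="my-training-image",  # placeholder
    role="arn:aws:iam::123456789012:role/MyRole",  # placeholder
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
)

# ExperimentName/TrialName are ignored for pipeline-launched jobs;
# only TrialComponentDisplayName survives into the step arguments.
step_args = estimator.fit(
    inputs="s3://my-bucket/train",  # placeholder
    experiment_config={
        "ExperimentName": "my-experiment",
        "TrialName": "my-trial",
        "TrialComponentDisplayName": "my-training-display-name",
    },
)

step = TrainingStep(name="MyTrainingStep", step_args=step_args)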

src/sagemaker/processing.py

Lines changed: 18 additions & 1 deletion
@@ -173,9 +173,14 @@ def run(
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                   will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+                * Both `ExperimentName` and `TrialName` will be ignored if the Processor instance
+                  is built with :class:`~sagemaker.workflow.pipeline_context.PipelineSession`.
+                  However, the value of `TrialComponentDisplayName` is honored for display in Studio.
             kms_key (str): The ARN of the KMS key that is used to encrypt the
                 user code file (default: None).
-
+        Returns:
+            None or pipeline step arguments in case the Processor instance is built with
+            :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
         Raises:
             ValueError: if ``logs`` is True but ``wait`` is False.
         """
@@ -543,8 +548,14 @@ def run(
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                   will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+                * Both `ExperimentName` and `TrialName` will be ignored if the Processor instance
+                  is built with :class:`~sagemaker.workflow.pipeline_context.PipelineSession`.
+                  However, the value of `TrialComponentDisplayName` is honored for display in Studio.
             kms_key (str): The ARN of the KMS key that is used to encrypt the
                 user code file (default: None).
+        Returns:
+            None or pipeline step arguments in case the Processor instance is built with
+            :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
         """
         normalized_inputs, normalized_outputs = self._normalize_args(
             job_name=job_name,
@@ -1601,8 +1612,14 @@ def run( # type: ignore[override]
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                   will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+                * Both `ExperimentName` and `TrialName` will be ignored if the Processor instance
+                  is built with :class:`~sagemaker.workflow.pipeline_context.PipelineSession`.
+                  However, the value of `TrialComponentDisplayName` is honored for display in Studio.
             kms_key (str): The ARN of the KMS key that is used to encrypt the
                 user code file (default: None).
+        Returns:
+            None or pipeline step arguments in case the Processor instance is built with
+            :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
         """
         s3_runproc_sh, inputs, job_name = self._pack_and_upload_code(
             code, source_dir, dependencies, git_config, job_name, inputs
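
The processor counterpart, again only a hedged sketch with placeholder names:

from sagemaker.processing import Processor
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import ProcessingStep

pipeline_session = PipelineSession()

processor = Processor(
    image_uri="my-processing-image",  # placeholder
    role="arn:aws:iam::123456789012:role/MyRole",  # placeholder
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
)

# Only TrialComponentDisplayName is kept in the rendered step arguments.
step_args = processor.run(
    inputs=None,  # a list of ProcessingInput objects would normally go here
    experiment_config={"TrialComponentDisplayName": "my-processing-display-name"},
)

step = ProcessingStep(name="MyProcessingStep", step_args=step_args)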

src/sagemaker/transformer.py

Lines changed: 8 additions & 0 deletions
@@ -186,6 +186,9 @@ def transform(
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                   will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+                * Both `ExperimentName` and `TrialName` will be ignored if the Transformer instance
+                  is built with :class:`~sagemaker.workflow.pipeline_context.PipelineSession`.
+                  However, the value of `TrialComponentDisplayName` is honored for display in Studio.
             model_client_config (dict[str, str]): Model configuration.
                 Dictionary contains two optional keys,
                 'InvocationsTimeoutInSeconds', and 'InvocationsMaxRetries'.
@@ -194,6 +197,11 @@ def transform(
                 (default: ``True``).
             logs (bool): Whether to show the logs produced by the job.
                 Only meaningful when wait is ``True`` (default: ``True``).
+            kms_key (str): The ARN of the KMS key that is used to encrypt the
+                user code file (default: None).
+        Returns:
+            None or pipeline step arguments in case the Transformer instance is built with
+            :class:`~sagemaker.workflow.pipeline_context.PipelineSession`
         """
         local_mode = self.sagemaker_session.local_mode
         if not local_mode and not is_pipeline_variable(data) and not data.startswith("s3://"):
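
And the transform counterpart as a sketch (model name, bucket, and paths are placeholders):

from sagemaker.transformer import Transformer
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.steps import TransformStep

pipeline_session = PipelineSession()

transformer = Transformer(
    model_name="my-model",  # placeholder
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path="s3://my-bucket/transform-output",  # placeholder
    sagemaker_session=pipeline_session,
)

step_args = transformer.transform(
    data="s3://my-bucket/batch-input",  # placeholder
    experiment_config={"TrialComponentDisplayName": "my-transform-display-name"},
)

step = TransformStep(name="MyTransformStep", step_args=step_args)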

src/sagemaker/workflow/execution_variables.py

Lines changed: 4 additions & 0 deletions
@@ -58,6 +58,8 @@ class ExecutionVariables:
     - ExecutionVariables.PIPELINE_ARN
     - ExecutionVariables.PIPELINE_EXECUTION_ID
     - ExecutionVariables.PIPELINE_EXECUTION_ARN
+    - ExecutionVariables.TRAINING_JOB_NAME
+    - ExecutionVariables.PROCESSING_JOB_NAME
     """

     START_DATETIME = ExecutionVariable("StartDateTime")
@@ -66,3 +68,5 @@ class ExecutionVariables:
     PIPELINE_ARN = ExecutionVariable("PipelineArn")
     PIPELINE_EXECUTION_ID = ExecutionVariable("PipelineExecutionId")
     PIPELINE_EXECUTION_ARN = ExecutionVariable("PipelineExecutionArn")
+    TRAINING_JOB_NAME = ExecutionVariable("TrainingJobName")
+    PROCESSING_JOB_NAME = ExecutionVariable("ProcessingJobName")
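
For reference, each ExecutionVariable is rendered as a Get expression in the pipeline definition; assuming the usual `expr` property on ExecutionVariable, the two new members resolve as follows:

from sagemaker.workflow.execution_variables import ExecutionVariables

assert ExecutionVariables.TRAINING_JOB_NAME.expr == {"Get": "Execution.TrainingJobName"}
assert ExecutionVariables.PROCESSING_JOB_NAME.expr == {"Get": "Execution.ProcessingJobName"}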

src/sagemaker/workflow/steps.py

Lines changed: 32 additions & 3 deletions
@@ -429,7 +429,16 @@ def arguments(self) -> RequestType:
         request_dict["HyperParameters"].pop("sagemaker_job_name", None)

         request_dict.pop("TrainingJobName", None)
-        request_dict.pop("ExperimentConfig", None)
+
+        # only keep the trial component name
+        if request_dict.get("ExperimentConfig", {}).get("TrialComponentDisplayName"):
+            request_dict["ExperimentConfig"] = {
+                "TrialComponentDisplayName": request_dict["ExperimentConfig"][
+                    "TrialComponentDisplayName"
+                ]
+            }
+        else:
+            request_dict.pop("ExperimentConfig", None)

         return request_dict

@@ -660,7 +669,17 @@ def arguments(self) -> RequestType:
         )

         request_dict.pop("TransformJobName", None)
-        request_dict.pop("ExperimentConfig", None)
+
+        # only keep the trial component name
+        if request_dict.get("ExperimentConfig", {}).get("TrialComponentDisplayName"):
+            request_dict["ExperimentConfig"] = {
+                "TrialComponentDisplayName": request_dict["ExperimentConfig"][
+                    "TrialComponentDisplayName"
+                ]
+            }
+        else:
+            request_dict.pop("ExperimentConfig", None)
+
         return request_dict

     @property
@@ -808,7 +827,17 @@ def arguments(self) -> RequestType:
         request_dict = self.processor.sagemaker_session._get_process_request(**process_args)

         request_dict.pop("ProcessingJobName", None)
-        request_dict.pop("ExperimentConfig", None)
+
+        # only keep the trial component name
+        if request_dict.get("ExperimentConfig", {}).get("TrialComponentDisplayName"):
+            request_dict["ExperimentConfig"] = {
+                "TrialComponentDisplayName": request_dict["ExperimentConfig"][
+                    "TrialComponentDisplayName"
+                ]
+            }
+        else:
+            request_dict.pop("ExperimentConfig", None)
+
         return request_dict

     @property
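
The three hunks above apply the same pruning rule to training, transform, and processing requests. Pulled out into a hypothetical standalone helper (not part of the commit), the behavior is:

def keep_only_trial_component_display_name(request_dict):
    """Hypothetical helper mirroring the logic added above.

    The pipeline execution manages ExperimentName/TrialName itself, so only a
    user-supplied TrialComponentDisplayName is propagated to the step arguments.
    """
    experiment_config = request_dict.get("ExperimentConfig") or {}
    display_name = experiment_config.get("TrialComponentDisplayName")
    if display_name:
        request_dict["ExperimentConfig"] = {"TrialComponentDisplayName": display_name}
    else:
        request_dict.pop("ExperimentConfig", None)
    return request_dict


# Example: ExperimentName is dropped, the display name is kept.
print(keep_only_trial_component_display_name(
    {"ExperimentConfig": {"ExperimentName": "e", "TrialComponentDisplayName": "d"}}
))
# {'ExperimentConfig': {'TrialComponentDisplayName': 'd'}}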

tests/data/_repack_model.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Repack model script for training jobs to inject entry points"""
from __future__ import absolute_import

import argparse
import os
import shutil
import tarfile
import tempfile

# Repack Model
# The following script is run via a training job which takes an existing model and a custom
# entry point script as arguments. The script creates a new model archive with the custom
# entry point in the "code" directory along with the existing model. Subsequently, when the model
# is unpacked for inference, the custom entry point will be used.
# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/amazon-sagemaker-toolkits.html

# distutils.dir_util.copy_tree works way better than the half-baked
# shutil.copytree which bombs on previously existing target dirs...
# alas ... https://bugs.python.org/issue10948
# we'll go ahead and use the copy_tree function anyways because this
# repacking is some short-lived hackery, right??
from distutils.dir_util import copy_tree


def repack(inference_script, model_archive, dependencies=None, source_dir=None):  # pragma: no cover
    """Repack custom dependencies and code into an existing model TAR archive

    Args:
        inference_script (str): The path to the custom entry point.
        model_archive (str): The name or path (e.g. s3 uri) of the model TAR archive.
        dependencies (str): A space-delimited string of paths to custom dependencies.
        source_dir (str): The path to a custom source directory.
    """

    # the data directory contains a model archive generated by a previous training job
    data_directory = "/opt/ml/input/data/training"
    model_path = os.path.join(data_directory, model_archive.split("/")[-1])

    # create a temporary directory
    with tempfile.TemporaryDirectory() as tmp:
        local_path = os.path.join(tmp, "local.tar.gz")
        # copy the previous training job's model archive to the temporary directory
        shutil.copy2(model_path, local_path)
        src_dir = os.path.join(tmp, "src")
        # create the "code" directory which will contain the inference script
        code_dir = os.path.join(src_dir, "code")
        os.makedirs(code_dir)
        # extract the contents of the previous training job's model archive to the "src"
        # directory of this training job
        with tarfile.open(name=local_path, mode="r:gz") as tf:
            tf.extractall(path=src_dir)

        if source_dir:
            # copy /opt/ml/code to code/
            if os.path.exists(code_dir):
                shutil.rmtree(code_dir)
            shutil.copytree("/opt/ml/code", code_dir)
        else:
            # copy the custom inference script to code/
            entry_point = os.path.join("/opt/ml/code", inference_script)
            shutil.copy2(entry_point, os.path.join(code_dir, inference_script))

        # copy any dependencies to code/lib/
        if dependencies:
            for dependency in dependencies.split(" "):
                actual_dependency_path = os.path.join("/opt/ml/code", dependency)
                lib_dir = os.path.join(code_dir, "lib")
                if not os.path.exists(lib_dir):
                    os.mkdir(lib_dir)
                if os.path.isfile(actual_dependency_path):
                    shutil.copy2(actual_dependency_path, lib_dir)
                else:
                    if os.path.exists(lib_dir):
                        shutil.rmtree(lib_dir)
                    # a directory is in the dependencies. we have to copy
                    # all of /opt/ml/code into the lib dir because the original directory
                    # was flattened by the SDK training job upload..
                    shutil.copytree("/opt/ml/code", lib_dir)
                    break

        # copy the "src" dir, which includes the previous training job's model and the
        # custom inference script, to the output of this training job
        copy_tree(src_dir, "/opt/ml/model")


if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser()
    parser.add_argument("--inference_script", type=str, default="inference.py")
    parser.add_argument("--dependencies", type=str, default=None)
    parser.add_argument("--source_dir", type=str, default=None)
    parser.add_argument("--model_archive", type=str, default="model.tar.gz")
    args, extra = parser.parse_known_args()
    repack(
        inference_script=args.inference_script,
        dependencies=args.dependencies,
        source_dir=args.source_dir,
        model_archive=args.model_archive,
    )
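
For context, the script is meant to run inside a SageMaker training container, where the /opt/ml paths used above are fixed by the environment; a hypothetical direct call would look like:

# Assumes the container layout above: the original model archive is staged under
# /opt/ml/input/data/training and the entry point lives under /opt/ml/code.
repack(
    inference_script="inference.py",
    model_archive="s3://my-bucket/model.tar.gz",  # only the basename is used locally
    dependencies=None,
    source_dir=None,
)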

tests/unit/sagemaker/workflow/test_processing_step.py

Lines changed: 40 additions & 3 deletions
@@ -18,6 +18,8 @@
 import pytest
 import warnings

+from copy import deepcopy
+
 from sagemaker.estimator import Estimator
 from sagemaker.parameter import IntegerParameter
 from sagemaker.transformer import Transformer
@@ -268,7 +270,34 @@ def network_config():
     )


-def test_processing_step_with_processor(pipeline_session, processing_input):
+@pytest.mark.parametrize(
+    "experiment_config, expected_experiment_config",
+    [
+        (
+            {
+                "ExperimentName": "experiment-name",
+                "TrialName": "trial-name",
+                "TrialComponentDisplayName": "display-name",
+            },
+            {"TrialComponentDisplayName": "display-name"},
+        ),
+        (
+            {"TrialComponentDisplayName": "display-name"},
+            {"TrialComponentDisplayName": "display-name"},
+        ),
+        (
+            {
+                "ExperimentName": "experiment-name",
+                "TrialName": "trial-name",
+            },
+            None,
+        ),
+        (None, None),
+    ],
+)
+def test_processing_step_with_processor(
+    pipeline_session, processing_input, experiment_config, expected_experiment_config
+):
     custom_step1 = CustomStep("TestStep")
     custom_step2 = CustomStep("SecondTestStep")
     processor = Processor(
@@ -280,7 +309,7 @@ def test_processing_step_with_processor(pipeline_session, processing_input):
     )

     with warnings.catch_warnings(record=True) as w:
-        step_args = processor.run(inputs=processing_input)
+        step_args = processor.run(inputs=processing_input, experiment_config=experiment_config)
     assert len(w) == 1
     assert issubclass(w[-1].category, UserWarning)
     assert "Running within a PipelineSession" in str(w[-1].message)
@@ -307,13 +336,21 @@ def test_processing_step_with_processor(pipeline_session, processing_input):
         steps=[step, custom_step1, custom_step2],
         sagemaker_session=pipeline_session,
     )
+
+    expected_step_arguments = deepcopy(step_args.args)
+    if expected_experiment_config is None:
+        expected_step_arguments.pop("ExperimentConfig", None)
+    else:
+        expected_step_arguments["ExperimentConfig"] = expected_experiment_config
+    del expected_step_arguments["ProcessingJobName"]
+
     assert json.loads(pipeline.definition())["Steps"][0] == {
         "Name": "MyProcessingStep",
         "Description": "ProcessingStep description",
         "DisplayName": "MyProcessingStep",
         "Type": "Processing",
         "DependsOn": ["TestStep", "SecondTestStep"],
-        "Arguments": step_args.args,
+        "Arguments": expected_step_arguments,
         "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
         "PropertyFiles": [
0 commit comments
