fix: support maps in step parameters (aws#2661)

jayatalr · ahsan-z-khan · EthanShouhanCheng · commit 459e3a969ab1 · 2022-01-11T13:26:16.000-08:00
Co-authored-by: Ahsan Khan <ahsan.al.zaki@gmail.com>
diff --git a/src/sagemaker/workflow/properties.py b/src/sagemaker/workflow/properties.py
@@ -77,6 +77,8 @@ def __init__(
                 for key, info in members.items():
                     if shapes.get(info["shape"], {}).get("type") == "list":
                         self.__dict__[key] = PropertiesList(f"{path}.{key}", info["shape"])
+                    elif Properties._shapes.get(info["shape"], {}).get("type") == "map":
+                        self.__dict__[key] = PropertiesMap(f"{path}.{key}", info["shape"])
                     else:
                         self.__dict__[key] = Properties(
                             f"{path}.{key}", info["shape"], service_name=service_name
@@ -122,6 +124,38 @@ def __getitem__(self, item: Union[int, str]):
         return self._items.get(item)
 
 
+class PropertiesMap(Properties):
+    """Map-typed Properties for use in workflow expressions, indexable by key."""
+
+    def __init__(self, path: str, shape_name: str = None):
+        """Create a PropertiesMap instance representing the given shape.
+
+        Args:
+            path (str): The path of this PropertiesMap, forwarded to Properties.__init__.
+            shape_name (str): The botocore sagemaker service model shape name.
+        """
+        super(PropertiesMap, self).__init__(path, shape_name)
+        self.shape_name = shape_name
+        self._items: Dict[Union[int, str], Properties] = dict()  # cache of created items
+
+    def __getitem__(self, item: Union[int, str]):
+        """Return the Properties for the given key, creating and caching it on first use.
+
+        Args:
+            item (Union[int, str]): The key to look up; str keys are quoted in the path.
+        """
+        if item not in self._items.keys():
+            shape = Properties._shapes.get(self.shape_name)  # NOTE(review): None if missing?
+            member = shape["value"]["shape"]  # shape name of the map's values
+            if isinstance(item, str):  # string keys are quoted, e.g. path['key']
+                property_item = Properties(f"{self._path}['{item}']", member)
+            else:
+                property_item = Properties(f"{self._path}[{item}]", member)
+            self._items[item] = property_item
+
+        return self._items.get(item)
+
+
 @attr.s
 class PropertyFile(Expression):
     """Provides a property file struct.
diff --git a/tests/integ/test_workflow.py b/tests/integ/test_workflow.py
@@ -856,6 +856,148 @@ def test_one_step_callback_pipeline(sagemaker_session, role, pipeline_name, regi
             pass
 
 
+def test_steps_with_map_params_pipeline(
+    sagemaker_session, role, script_dir, pipeline_name, region_name, athena_dataset_definition
+):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=2)
+    framework_version = "0.20.0"
+    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
+    output_prefix = ParameterString(name="OutputPrefix", default_value="output")
+    input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"
+
+    sklearn_processor = SKLearnProcessor(
+        framework_version=framework_version,
+        instance_type=instance_type,
+        instance_count=instance_count,
+        base_job_name="test-sklearn",
+        sagemaker_session=sagemaker_session,
+        role=role,
+    )
+    step_process = ProcessingStep(
+        name="my-process",
+        display_name="ProcessingStep",
+        description="description for Processing step",
+        processor=sklearn_processor,
+        inputs=[
+            ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
+            ProcessingInput(dataset_definition=athena_dataset_definition),
+        ],
+        outputs=[
+            ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
+            ProcessingOutput(
+                output_name="test_data",
+                source="/opt/ml/processing/test",
+                destination=Join(
+                    on="/",
+                    values=[
+                        "s3:/",
+                        sagemaker_session.default_bucket(),
+                        "test-sklearn",
+                        output_prefix,
+                        ExecutionVariables.PIPELINE_EXECUTION_ID,
+                    ],
+                ),
+            ),
+        ],
+        code=os.path.join(script_dir, "preprocessing.py"),
+    )
+
+    sklearn_train = SKLearn(
+        framework_version=framework_version,
+        entry_point=os.path.join(script_dir, "train.py"),
+        instance_type=instance_type,
+        sagemaker_session=sagemaker_session,
+        role=role,
+        hyperparameters={
+            "batch-size": 500,
+            "epochs": 5,
+        },
+    )
+    step_train = TrainingStep(
+        name="my-train",
+        display_name="TrainingStep",
+        description="description for Training step",
+        estimator=sklearn_train,
+        inputs=TrainingInput(
+            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
+                "train_data"
+            ].S3Output.S3Uri
+        ),
+    )
+
+    model = Model(
+        image_uri=sklearn_train.image_uri,
+        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
+        sagemaker_session=sagemaker_session,
+        role=role,
+    )
+    model_inputs = CreateModelInput(
+        instance_type="ml.m5.large",
+        accelerator_type="ml.eia1.medium",
+    )
+    step_model = CreateModelStep(
+        name="my-model",
+        display_name="ModelStep",
+        description="description for Model step",
+        model=model,
+        inputs=model_inputs,
+    )
+
+    # Condition on a string-keyed step property (HyperParameters) to branch execution
+    cond_lte = ConditionGreaterThanOrEqualTo(  # NOTE(review): name says "lte" but this is >=
+        left=step_train.properties.HyperParameters["batch-size"],
+        right=6.0,
+    )
+
+    step_cond = ConditionStep(
+        name="CustomerChurnAccuracyCond",
+        conditions=[cond_lte],
+        if_steps=[],
+        else_steps=[step_model],
+    )
+
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_type, instance_count, output_prefix],
+        steps=[step_process, step_train, step_cond],
+        sagemaker_session=sagemaker_session,
+    )
+
+    definition = json.loads(pipeline.definition())
+    assert definition["Version"] == "2020-12-01"
+
+    steps = definition["Steps"]
+    assert len(steps) == 3
+    training_args = {}
+    condition_args = {}
+    for step in steps:  # pick out the arguments of the Training and Condition steps
+        if step["Type"] == "Training":
+            training_args = step["Arguments"]
+        if step["Type"] == "Condition":
+            condition_args = step["Arguments"]
+
+    assert training_args["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"] == {
+        "Get": "Steps.my-process.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri"
+    }
+    assert condition_args["Conditions"][0]["LeftValue"] == {
+        "Get": "Steps.my-train.HyperParameters['batch-size']"
+    }
+
+    try:
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:  # best-effort cleanup: deletion errors are deliberately ignored
+            pass
+
+
 def test_two_step_callback_pipeline_with_output_reference(
     sagemaker_session, role, pipeline_name, region_name
 ):
diff --git a/tests/unit/sagemaker/workflow/test_properties.py b/tests/unit/sagemaker/workflow/test_properties.py
@@ -22,6 +22,7 @@ def test_properties_describe_training_job_response():
     for name in some_prop_names:
         assert name in prop.__dict__.keys()
     assert prop.CreationTime.expr == {"Get": "Steps.MyStep.CreationTime"}
+    assert prop.HyperParameters.expr == {"Get": "Steps.MyStep.HyperParameters"}
     assert prop.OutputDataConfig.S3OutputPath.expr == {
         "Get": "Steps.MyStep.OutputDataConfig.S3OutputPath"
     }
diff --git a/tests/unit/sagemaker/workflow/test_steps.py b/tests/unit/sagemaker/workflow/test_steps.py
@@ -226,6 +226,7 @@ def test_training_step_base_estimator(sagemaker_session):
         "CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},
     }
     assert step.properties.TrainingJobName.expr == {"Get": "Steps.MyTrainingStep.TrainingJobName"}
+    assert step.properties.HyperParameters.expr == {"Get": "Steps.MyTrainingStep.HyperParameters"}
 
 
 def test_training_step_tensorflow(sagemaker_session):

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ def test_properties_describe_training_job_response():`
`22`	`22`	`for name in some_prop_names:`
`23`	`23`	`assert name in prop.__dict__.keys()`
`24`	`24`	`assert prop.CreationTime.expr == {"Get": "Steps.MyStep.CreationTime"}`
	`25`	`+ assert prop.HyperParameters.expr == {"Get": "Steps.MyStep.HyperParameters"}`
`25`	`26`	`assert prop.OutputDataConfig.S3OutputPath.expr == {`
`26`	`27`	`"Get": "Steps.MyStep.OutputDataConfig.S3OutputPath"`
`27`	`28`	`}`
Original file line number	Diff line number	Diff line change
`@@ -226,6 +226,7 @@ def test_training_step_base_estimator(sagemaker_session):`
`226`	`226`	`"CacheConfig": {"Enabled": True, "ExpireAfter": "PT1H"},`
`227`	`227`	`}`
`228`	`228`	`assert step.properties.TrainingJobName.expr == {"Get": "Steps.MyTrainingStep.TrainingJobName"}`
	`229`	`+ assert step.properties.HyperParameters.expr == {"Get": "Steps.MyTrainingStep.HyperParameters"}`
`229`	`230`
`230`	`231`
`231`	`232`	`def test_training_step_tensorflow(sagemaker_session):`