aws
diff --git a/‎src/sagemaker/workflow/_utils.py
+10-4 b/‎src/sagemaker/workflow/_utils.py
+10-4
diff --git a/‎src/sagemaker/workflow/retry.py
+204 b/‎src/sagemaker/workflow/retry.py
+204
diff --git a/‎src/sagemaker/workflow/step_collections.py
+23 b/‎src/sagemaker/workflow/step_collections.py
+23
@@ -28,13 +28,14 @@
 from sagemaker.sklearn.estimator import SKLearn
 from sagemaker.workflow.entities import RequestType
 from sagemaker.workflow.properties import Properties
-from sagemaker.session import get_create_model_package_request
-from sagemaker.session import get_model_package_args
+from sagemaker.session import get_create_model_package_request, get_model_package_args
 from sagemaker.workflow.steps import (
     StepTypeEnum,
     TrainingStep,
     Step,
+    ConfigurableRetryStep,
 )
+from sagemaker.workflow.retry import RetryPolicy
 
 FRAMEWORK_VERSION = "0.23-1"
 INSTANCE_TYPE = "ml.m5.large"
@@ -60,6 +61,7 @@ def __init__(
         source_dir: str = None,
         dependencies: List = None,
         depends_on: Union[List[str], List[Step]] = None,
+        retry_policies: List[RetryPolicy] = None,
         subnets=None,
         security_group_ids=None,
         **kwargs,
@@ -126,6 +128,7 @@ def __init__(
                     This is not supported with "local code" in Local Mode.
             depends_on (List[str] or List[Step]): A list of step names or instances
                     this step depends on
+            retry_policies (List[RetryPolicy]): The list of retry policies for the current step
             subnets (list[str]): List of subnet ids. If not specified, the re-packing
                     job will be created without VPC config.
             security_group_ids (list[str]): List of security group ids. If not
@@ -178,6 +181,7 @@ def __init__(
             display_name=display_name,
             description=description,
             depends_on=depends_on,
+            retry_policies=retry_policies,
             estimator=repacker,
             inputs=inputs,
         )
@@ -259,7 +263,7 @@ def properties(self):
         return self._properties
 
 
-class _RegisterModelStep(Step):
+class _RegisterModelStep(ConfigurableRetryStep):
     """Register model step in workflow that creates a model package.
 
     Attributes:
@@ -302,6 +306,7 @@ def __init__(
         display_name: str = None,
         description=None,
         depends_on: Union[List[str], List[Step]] = None,
+        retry_policies: List[RetryPolicy] = None,
         tags=None,
         container_def_list=None,
         **kwargs,
@@ -339,10 +344,11 @@ def __init__(
             description (str): Model Package description (default: None).
             depends_on (List[str] or List[Step]): A list of step names or instances
                 this step depends on
+            retry_policies (List[RetryPolicy]): The list of retry policies for the current step
             **kwargs: additional arguments to `create_model`.
         """
         super(_RegisterModelStep, self).__init__(
-            name, display_name, description, StepTypeEnum.REGISTER_MODEL, depends_on
+            name, StepTypeEnum.REGISTER_MODEL, display_name, description, depends_on, retry_policies
         )
         self.estimator = estimator
         self.model_data = model_data
 
@@ -0,0 +1,204 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""Retry policies and retryable exception types for pipeline workflow steps."""
+from __future__ import absolute_import
+
+from enum import Enum
+from typing import List
+import attr
+
+from sagemaker.workflow.entities import Entity, DefaultEnumMeta, RequestType
+
+
+# Default multiplier applied to the retry interval (default of RetryPolicy.backoff_rate).
+DEFAULT_BACKOFF_RATE = 2.0
+# Default number of seconds before the first retry (default of RetryPolicy.interval_seconds).
+DEFAULT_INTERVAL_SECONDS = 1
+# Upper bound accepted by the RetryPolicy.max_attempts validator.
+MAX_ATTEMPTS_CAP = 20
+# Upper bound, in minutes, accepted by the RetryPolicy.expire_after_mins validator.
+MAX_EXPIRE_AFTER_MIN = 14400
+
+
+class StepExceptionTypeEnum(Enum, metaclass=DefaultEnumMeta):
+    """Step ExceptionType enum.
+
+    These are the only exception types accepted by `StepRetryPolicy`; each
+    member's value is the string placed in the "ExceptionType" list of the
+    policy's request structure.
+    """
+
+    SERVICE_FAULT = "Step.SERVICE_FAULT"
+    THROTTLING = "Step.THROTTLING"
+
+
+class SageMakerJobExceptionTypeEnum(Enum, metaclass=DefaultEnumMeta):
+    """SageMaker Job ExceptionType enum.
+
+    These are the only exception types accepted by `SageMakerJobStepRetryPolicy`
+    (for both its `exception_types` and `failure_reason_types` arguments); each
+    member's value is the string placed in the "ExceptionType" list of the
+    policy's request structure.
+    """
+
+    INTERNAL_ERROR = "SageMaker.JOB_INTERNAL_ERROR"
+    CAPACITY_ERROR = "SageMaker.CAPACITY_ERROR"
+    RESOURCE_LIMIT = "SageMaker.RESOURCE_LIMIT"
+
+
+@attr.s
+class RetryPolicy(Entity):
+    """Base class for retry policies attached to pipeline steps.
+
+    A policy must be bounded by exactly one of `max_attempts` or
+    `expire_after_mins`. That rule is enforced when the policy is serialized
+    by `to_request`, not at construction time.
+
+    Attributes:
+        backoff_rate (float): The multiplier by which the retry interval increases
+            during each attempt (default: 2.0)
+        interval_seconds (int): An integer that represents the number of seconds before the
+            first retry attempt (default: 1)
+        max_attempts (int): A positive integer that represents the maximum
+            number of retry attempts. (default: None)
+        expire_after_mins (int): A positive integer that represents the maximum minute
+            to expire any further retry attempt (default: None)
+    """
+
+    backoff_rate: float = attr.ib(default=DEFAULT_BACKOFF_RATE)
+    interval_seconds: int = attr.ib(default=DEFAULT_INTERVAL_SECONDS)
+    max_attempts: int = attr.ib(default=None)
+    expire_after_mins: int = attr.ib(default=None)
+
+    # NOTE: every validator below is skipped for falsy values (None and 0), and
+    # reports violations through `assert`, i.e. as AssertionError.
+
+    @backoff_rate.validator
+    def validate_backoff_rate(self, _, value):
+        """Validate that the back off rate is non-negative.
+
+        Raises:
+            AssertionError: If `value` is negative.
+        """
+        if value:
+            assert value >= 0.0, "backoff_rate should be non-negative"
+
+    @interval_seconds.validator
+    def validate_interval_seconds(self, _, value):
+        """Validate that the interval seconds is non-negative.
+
+        Raises:
+            AssertionError: If `value` is negative.
+        """
+        if value:
+            assert value >= 0.0, "interval_seconds rate should be non-negative"
+
+    @max_attempts.validator
+    def validate_max_attempts(self, _, value):
+        """Validate that max attempts lies within [1, MAX_ATTEMPTS_CAP].
+
+        Raises:
+            AssertionError: If `value` is outside the allowed range.
+        """
+        if value:
+            assert (
+                MAX_ATTEMPTS_CAP >= value >= 1
+            ), f"max_attempts must in range of (0, {MAX_ATTEMPTS_CAP}] attempts"
+
+    @expire_after_mins.validator
+    def validate_expire_after_mins(self, _, value):
+        """Validate that expire after mins does not exceed MAX_EXPIRE_AFTER_MIN.
+
+        Raises:
+            AssertionError: If `value` is outside the allowed range.
+        """
+        if value:
+            assert (
+                MAX_EXPIRE_AFTER_MIN >= value >= 0
+            ), f"expire_after_mins must in range of (0, {MAX_EXPIRE_AFTER_MIN}] minutes"
+
+    def to_request(self) -> RequestType:
+        """Get the request structure for workflow service calls.
+
+        Returns:
+            A dict with "BackoffRate" and "IntervalSeconds", plus "MaxAttempts"
+            or "ExpireAfterMin" depending on which bound was configured.
+
+        Raises:
+            ValueError: If both or neither of `max_attempts` and
+                `expire_after_mins` are set.
+        """
+        # Both sides must be parenthesized. Without the second pair of parentheses
+        # Python parses `(a is None) == b is None` as the chained comparison
+        # `((a is None) == b) and (b is None)`, which is always False, so the
+        # check below would never fire.
+        if (self.max_attempts is None) == (self.expire_after_mins is None):
+            raise ValueError("Only one of [max_attempts] and [expire_after_mins] can be given.")
+
+        request = {
+            "BackoffRate": self.backoff_rate,
+            "IntervalSeconds": self.interval_seconds,
+        }
+
+        if self.max_attempts:
+            request["MaxAttempts"] = self.max_attempts
+
+        if self.expire_after_mins:
+            request["ExpireAfterMin"] = self.expire_after_mins
+
+        return request
+
+
+class StepRetryPolicy(RetryPolicy):
+    """RetryPolicy for a retryable step.
+
+    The pipeline service will retry
+    `sagemaker.workflow.retry.StepExceptionTypeEnum.SERVICE_FAULT` and
+    `sagemaker.workflow.retry.StepExceptionTypeEnum.THROTTLING` regardless of
+    pipeline step type by default. However, for a step defined as retryable, you
+    can override them by specifying a StepRetryPolicy.
+
+    Attributes:
+        exception_types (List[StepExceptionTypeEnum]): the exception types to match for this policy
+        backoff_rate (float): The multiplier by which the retry interval increases
+            during each attempt (default: 2.0)
+        interval_seconds (int): An integer that represents the number of seconds before the
+            first retry attempt (default: 1)
+        max_attempts (int): A positive integer that represents the maximum
+            number of retry attempts. (default: None)
+        expire_after_mins (int): A positive integer that represents the maximum minute
+            to expire any further retry attempt (default: None)
+    """
+
+    def __init__(
+        self,
+        exception_types: List[StepExceptionTypeEnum],
+        backoff_rate: float = 2.0,
+        interval_seconds: int = 1,
+        max_attempts: int = None,
+        expire_after_mins: int = None,
+    ):
+        """Build the policy, rejecting entries that are not StepExceptionTypeEnum members.
+
+        Raises:
+            ValueError: If any element of `exception_types` is not a
+                StepExceptionTypeEnum; the first offending element is reported.
+        """
+        # The base initializer runs first so its attribute validators fire
+        # before the exception types are inspected.
+        super().__init__(backoff_rate, interval_seconds, max_attempts, expire_after_mins)
+        rejected = [
+            candidate
+            for candidate in exception_types
+            if not isinstance(candidate, StepExceptionTypeEnum)
+        ]
+        if rejected:
+            raise ValueError(f"{rejected[0]} is not of StepExceptionTypeEnum.")
+        self.exception_types = exception_types
+
+    def to_request(self) -> RequestType:
+        """Gets the request structure for retry policy.
+
+        Returns:
+            The base policy request extended with an "ExceptionType" list holding
+            the value of every configured exception type.
+        """
+        return {
+            **super().to_request(),
+            "ExceptionType": [member.value for member in self.exception_types],
+        }
+
+
+class SageMakerJobStepRetryPolicy(RetryPolicy):
+    """RetryPolicy for exception thrown by SageMaker Job.
+
+    The `exception_types` and `failure_reason_types` arguments are concatenated,
+    in that order, into the single `exception_type_list` attribute that is
+    emitted in the request.
+
+    Attributes:
+        exception_types (List[SageMakerJobExceptionTypeEnum]):
+            The SageMaker exception to match for this policy. The SageMaker exceptions
+            captured here are the exceptions thrown by synchronously
+            creating the job. For instance the resource limit exception.
+        failure_reason_types (List[SageMakerJobExceptionTypeEnum]): the SageMaker
+            failure reason types to match for this policy. The failure reason type
+            is presented in FailureReason field of the Describe response, it indicates
+            the runtime failure reason for a job.
+        backoff_rate (float): The multiplier by which the retry interval increases
+            during each attempt (default: 2.0)
+        interval_seconds (int): An integer that represents the number of seconds before the
+            first retry attempt (default: 1)
+        max_attempts (int): A positive integer that represents the maximum
+            number of retry attempts. (default: None)
+        expire_after_mins (int): A positive integer that represents the maximum minute
+            to expire any further retry attempt (default: None)
+    """
+
+    def __init__(
+        self,
+        exception_types: List[SageMakerJobExceptionTypeEnum] = None,
+        failure_reason_types: List[SageMakerJobExceptionTypeEnum] = None,
+        backoff_rate: float = 2.0,
+        interval_seconds: int = 1,
+        max_attempts: int = None,
+        expire_after_mins: int = None,
+    ):
+        """Build the policy from exception types and/or failure reason types.
+
+        Raises:
+            ValueError: If both `exception_types` and `failure_reason_types` are
+                empty or None, or if any supplied element is not a
+                SageMakerJobExceptionTypeEnum.
+        """
+        super().__init__(backoff_rate, interval_seconds, max_attempts, expire_after_mins)
+
+        if not (exception_types or failure_reason_types):
+            raise ValueError(
+                "At least one of the [exception_types, failure_reason_types] needs to be given."
+            )
+
+        # Merge both inputs into a fresh list so the caller's lists are never mutated.
+        self.exception_type_list: List[SageMakerJobExceptionTypeEnum] = [
+            *(exception_types or []),
+            *(failure_reason_types or []),
+        ]
+
+        rejected = [
+            candidate
+            for candidate in self.exception_type_list
+            if not isinstance(candidate, SageMakerJobExceptionTypeEnum)
+        ]
+        if rejected:
+            raise ValueError(f"{rejected[0]} is not of SageMakerJobExceptionTypeEnum.")
+
+    def to_request(self) -> RequestType:
+        """Gets the request structure for retry policy.
+
+        Returns:
+            The base policy request extended with an "ExceptionType" list holding
+            the value of every entry in `exception_type_list`.
+        """
+        return {
+            **super().to_request(),
+            "ExceptionType": [member.value for member in self.exception_type_list],
+        }
@@ -32,6 +32,7 @@
     _RegisterModelStep,
     _RepackModelStep,
 )
+from sagemaker.workflow.retry import RetryPolicy
 
 
 @attr.s
@@ -62,6 +63,8 @@ def __init__(
         estimator: EstimatorBase = None,
         model_data=None,
         depends_on: Union[List[str], List[Step]] = None,
+        repack_model_step_retry_policies: List[RetryPolicy] = None,
+        register_model_step_retry_policies: List[RetryPolicy] = None,
         model_package_group_name=None,
         model_metrics=None,
         approval_status=None,
@@ -87,6 +90,10 @@ def __init__(
                 job can be run or on which an endpoint can be deployed (default: None).
             depends_on (List[str] or List[Step]): The list of step names or step instances
                 the first step in the collection depends on
+            repack_model_step_retry_policies (List[RetryPolicy]): The list of retry policies
+                for the repack model step
+            register_model_step_retry_policies (List[RetryPolicy]): The list of retry policies
+                for register model step
             model_package_group_name (str): The Model Package Group name, exclusive to
                 `model_package_name`, using `model_package_group_name` makes the Model Package
                 versioned (default: None).
@@ -130,6 +137,7 @@ def __init__(
             repack_model_step = _RepackModelStep(
                 name=f"{name}RepackModel",
                 depends_on=depends_on,
+                retry_policies=repack_model_step_retry_policies,
                 sagemaker_session=estimator.sagemaker_session,
                 role=estimator.role,
                 model_data=model_data,
@@ -173,6 +181,7 @@ def __init__(
                     repack_model_step = _RepackModelStep(
                         name=f"{model_name}RepackModel",
                         depends_on=depends_on,
+                        retry_policies=repack_model_step_retry_policies,
                         sagemaker_session=sagemaker_session,
                         role=role,
                         model_data=model_entity.model_data,
@@ -216,6 +225,7 @@ def __init__(
             display_name=display_name,
             tags=tags,
             container_def_list=self.container_def_list,
+            retry_policies=register_model_step_retry_policies,
             **kwargs,
         )
         if not repack_model:
@@ -254,6 +264,10 @@ def __init__(
         tags=None,
         volume_kms_key=None,
         depends_on: Union[List[str], List[Step]] = None,
+        # step retry policies
+        repack_model_step_retry_policies: List[RetryPolicy] = None,
+        model_step_retry_policies: List[RetryPolicy] = None,
+        transform_step_retry_policies: List[RetryPolicy] = None,
         **kwargs,
     ):
         """Construct steps required for a Transformer step collection:
@@ -292,6 +306,12 @@ def __init__(
                 transform job (default: None).
             depends_on (List[str] or List[Step]): The list of step names or step instances
                 the first step in the collection depends on
+            repack_model_step_retry_policies (List[RetryPolicy]): The list of retry policies
+                for the repack model step
+            model_step_retry_policies (List[RetryPolicy]): The list of retry policies for
+                model step
+            transform_step_retry_policies (List[RetryPolicy]): The list of retry policies for
+                transform step
         """
         steps = []
         if "entry_point" in kwargs:
@@ -301,6 +321,7 @@ def __init__(
             repack_model_step = _RepackModelStep(
                 name=f"{name}RepackModel",
                 depends_on=depends_on,
+                retry_policies=repack_model_step_retry_policies,
                 sagemaker_session=estimator.sagemaker_session,
                 role=estimator.sagemaker_session,
                 model_data=model_data,
@@ -336,6 +357,7 @@ def predict_wrapper(endpoint, session):
             inputs=model_inputs,
             description=description,
             display_name=display_name,
+            retry_policies=model_step_retry_policies,
         )
         if "entry_point" not in kwargs and depends_on:
             # if the CreateModelStep is the first step in the collection
@@ -365,6 +387,7 @@ def predict_wrapper(endpoint, session):
             inputs=transform_inputs,
             description=description,
             display_name=display_name,
+            retry_policies=transform_step_retry_policies,
         )
         steps.append(transform_step)