feat: support customized timeout for model data download and inference container startup health check for Hosting Endpoints

chiui0x18 · chiui0x18 · commit bb3d0d2b6eef · 2022-10-26T18:01:10.000-07:00
This change also enables customization of ML instance storage volume size for Hosting Endpoints.
diff --git a/src/sagemaker/automl/automl.py b/src/sagemaker/automl/automl.py
@@ -350,6 +350,9 @@ def deploy(
         model_kms_key=None,
         predictor_cls=None,
         inference_response_keys=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     ):
         """Deploy a candidate to a SageMaker Inference Pipeline.
 
@@ -396,6 +399,16 @@ def deploy(
                 function on the created endpoint name.
             inference_response_keys (list): List of keys for response content. The order of the
                 keys will dictate the content order in the response.
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
 
         Returns:
             callable[string, sagemaker.session.Session] or ``None``:
@@ -421,6 +434,9 @@ def deploy(
             deserializer=deserializer,
             endpoint_name=endpoint_name,
             kms_key=model_kms_key,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
             tags=tags,
             wait=wait,
         )
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -1277,6 +1277,9 @@ def deploy(
         tags=None,
         serverless_inference_config=None,
         async_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
         **kwargs,
     ):
         """Deploy the trained model to an Amazon SageMaker endpoint.
@@ -1344,6 +1347,16 @@ def deploy(
                 For more information about tags, see
                 https://boto3.amazonaws.com/v1/documentation\
                 /api/latest/reference/services/sagemaker.html#SageMaker.Client.add_tags
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
             **kwargs: Passed to invocation of ``create_model()``.
                 Implementations may customize ``create_model()`` to accept
                 ``**kwargs`` to customize model creation during deploy.
@@ -1402,6 +1415,9 @@ def deploy(
             data_capture_config=data_capture_config,
             serverless_inference_config=serverless_inference_config,
             async_inference_config=async_inference_config,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
         )
 
     def register(
diff --git a/src/sagemaker/huggingface/model.py b/src/sagemaker/huggingface/model.py
@@ -206,6 +206,9 @@ def deploy(
         data_capture_config=None,
         async_inference_config=None,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
         **kwargs,
     ):
         """Deploy this ``Model`` to an ``Endpoint`` and optionally return a ``Predictor``.
@@ -269,6 +272,16 @@ def deploy(
                 empty object passed through, will use pre-defined values in
                 ``ServerlessInferenceConfig`` class to deploy serverless endpoint. Deploy an
                 instance based endpoint if it's None. (default: None)
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
         Raises:
              ValueError: If arguments combination check failed in these circumstances:
                 - If no role is specified or
@@ -301,6 +314,9 @@ def deploy(
             data_capture_config,
             async_inference_config,
             serverless_inference_config,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
         )
 
     def register(
diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py
@@ -1029,6 +1029,9 @@ def deploy(
         data_capture_config=None,
         async_inference_config=None,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
         **kwargs,
     ):
         """Deploy this ``Model`` to an ``Endpoint`` and optionally return a ``Predictor``.
@@ -1092,6 +1095,16 @@ def deploy(
                 empty object passed through, will use pre-defined values in
                 ``ServerlessInferenceConfig`` class to deploy serverless endpoint. Deploy an
                 instance based endpoint if it's None. (default: None)
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
         Raises:
              ValueError: If arguments combination check failed in these circumstances:
                 - If no role is specified or
@@ -1155,6 +1168,9 @@ def deploy(
             initial_instance_count,
             accelerator_type=accelerator_type,
             serverless_inference_config=serverless_inference_config_dict,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
         )
         if endpoint_name:
             self.endpoint_name = endpoint_name
diff --git a/src/sagemaker/pipeline.py b/src/sagemaker/pipeline.py
@@ -122,6 +122,9 @@ def deploy(
         update_endpoint=False,
         data_capture_config=None,
         kms_key=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     ):
         """Deploy the ``Model`` to an ``Endpoint``.
 
@@ -170,6 +173,16 @@ def deploy(
             kms_key (str): The ARN, Key ID or Alias of the KMS key that is used to
                 encrypt the data on the storage volume attached to the instance hosting
                 the endpoint.
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
 
         Returns:
             callable[string, sagemaker.session.Session] or None: Invocation of
@@ -191,7 +204,10 @@ def deploy(
         )
 
         production_variant = sagemaker.production_variant(
-            self.name, instance_type, initial_instance_count
+            self.name, instance_type, initial_instance_count,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
         )
         self.endpoint_name = endpoint_name or self.name
 
@@ -208,6 +224,9 @@ def deploy(
                 tags=tags,
                 kms_key=kms_key,
                 data_capture_config_dict=data_capture_config_dict,
+                volume_size=volume_size,
+                model_data_download_timeout=model_data_download_timeout,
+                container_startup_health_check_timeout=container_startup_health_check_timeout,
             )
             self.sagemaker_session.update_endpoint(
                 self.endpoint_name, endpoint_config_name, wait=wait
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -2981,6 +2981,9 @@ def create_endpoint_config(
         tags=None,
         kms_key=None,
         data_capture_config_dict=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     ):
         """Create an Amazon SageMaker endpoint configuration.
 
@@ -3004,6 +3007,16 @@ def create_endpoint_config(
                 attached to the instance hosting the endpoint.
             data_capture_config_dict (dict): Specifies configuration related to Endpoint data
                 capture for use with Amazon SageMaker Model Monitoring. Default: None.
+            volume_size (int): The size, in GB, of the ML storage volume attached to individual
+                inference instance associated with the production variant. Currenly only Amazon EBS
+                gp2 storage volumes are supported.
+            model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+                model data from Amazon S3 to the individual inference instance associated with this
+                production variant.
+            container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+                inference container to pass health check by SageMaker Hosting. For more information
+                about health check see:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
 
         Example:
             >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}]
@@ -3025,6 +3038,9 @@ def create_endpoint_config(
                     instance_type,
                     initial_instance_count,
                     accelerator_type=accelerator_type,
+                    volume_size=volume_size,
+                    model_data_download_timeout=model_data_download_timeout,
+                    container_startup_health_check_timeout=container_startup_health_check_timeout,
                 )
             ],
         }
@@ -4636,6 +4652,9 @@ def production_variant(
     initial_weight=1,
     accelerator_type=None,
     serverless_inference_config=None,
+    volume_size=None,
+    model_data_download_timeout=None,
+    container_startup_health_check_timeout=None,
 ):
     """Create a production variant description suitable for use in a ``ProductionVariant`` list.
 
@@ -4657,7 +4676,16 @@ def production_variant(
         serverless_inference_config (dict): Specifies configuration dict related to serverless
             endpoint. The dict is converted from sagemaker.model_monitor.ServerlessInferenceConfig
             object (default: None)
-
+        volume_size (int): The size, in GB, of the ML storage volume attached to individual
+            inference instance associated with the production variant. Currenly only Amazon EBS
+            gp2 storage volumes are supported.
+        model_data_download_timeout (int): The timeout value, in seconds, to download and extract
+            model data from Amazon S3 to the individual inference instance associated with this
+            production variant.
+        container_startup_health_check_timeout (int): The timeout value, in seconds, for your
+            inference container to pass health check by SageMaker Hosting. For more information
+            about health check see:
+            https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
     Returns:
         dict[str, str]: An SageMaker ``ProductionVariant`` description
     """
@@ -4676,6 +4704,12 @@ def production_variant(
         initial_instance_count = initial_instance_count or 1
         production_variant_configuration["InitialInstanceCount"] = initial_instance_count
         production_variant_configuration["InstanceType"] = instance_type
+        update_args(
+            production_variant_configuration,
+            VolumeSizeInGB=volume_size,
+            ModelDataDownloadTimeoutInSeconds=model_data_download_timeout,
+            ContainerStartupHealthCheckTimeoutInSeconds=container_startup_health_check_timeout,
+        )
 
     return production_variant_configuration
 
diff --git a/src/sagemaker/tensorflow/model.py b/src/sagemaker/tensorflow/model.py
@@ -320,6 +320,9 @@ def deploy(
         update_endpoint=None,
         async_inference_config=None,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     ):
         """Deploy a Tensorflow ``Model`` to a SageMaker ``Endpoint``."""
 
@@ -340,6 +343,9 @@ def deploy(
             data_capture_config=data_capture_config,
             async_inference_config=async_inference_config,
             serverless_inference_config=serverless_inference_config,
+            volume_size=volume_size,
+            model_data_download_timeout=model_data_download_timeout,
+            container_startup_health_check_timeout=container_startup_health_check_timeout,
             update_endpoint=update_endpoint,
         )
 
diff --git a/tests/unit/sagemaker/automl/test_auto_ml.py b/tests/unit/sagemaker/automl/test_auto_ml.py
@@ -596,6 +596,9 @@ def test_deploy_optional_args(candidate_estimator, sagemaker_session, candidate_
         deserializer=None,
         endpoint_name=JOB_NAME,
         kms_key=OUTPUT_KMS_KEY,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
         tags=TAGS,
         wait=False,
     )
diff --git a/tests/unit/sagemaker/model/test_deploy.py b/tests/unit/sagemaker/model/test_deploy.py
@@ -71,6 +71,9 @@ def test_deploy(name_from_base, prepare_container_def, production_variant, sagem
         INSTANCE_COUNT,
         accelerator_type=None,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     )
 
     sagemaker_session.create_model.assert_called_with(
@@ -120,6 +123,9 @@ def test_deploy_accelerator_type(
         INSTANCE_COUNT,
         accelerator_type=ACCELERATOR_TYPE,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     )
 
     sagemaker_session.endpoint_from_production_variants.assert_called_with(
@@ -363,6 +369,9 @@ def test_deploy_serverless_inference(production_variant, create_sagemaker_model,
         None,
         accelerator_type=None,
         serverless_inference_config=serverless_inference_config_dict,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     )
 
     sagemaker_session.endpoint_from_production_variants.assert_called_with(
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -3169,6 +3169,9 @@ def test_generic_to_deploy_kms(create_model, sagemaker_session):
         data_capture_config=None,
         async_inference_config=None,
         serverless_inference_config=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     )
 
 
diff --git a/tests/unit/test_pipeline_model.py b/tests/unit/test_pipeline_model.py
@@ -188,6 +188,9 @@ def test_deploy_update_endpoint(tfo, time, sagemaker_session):
         tags=None,
         kms_key=None,
         data_capture_config_dict=None,
+        volume_size=None,
+        model_data_download_timeout=None,
+        container_startup_health_check_timeout=None,
     )
     config_name = sagemaker_session.create_endpoint_config(
         name=model.name,

Original file line number	Diff line number	Diff line change
`@@ -3169,6 +3169,9 @@ def test_generic_to_deploy_kms(create_model, sagemaker_session):`
`3169`	`3169`	`data_capture_config=None,`
`3170`	`3170`	`async_inference_config=None,`
`3171`	`3171`	`serverless_inference_config=None,`
	`3172`	`+ volume_size=None,`
	`3173`	`+ model_data_download_timeout=None,`
	`3174`	`+ container_startup_health_check_timeout=None,`
`3172`	`3175`	`)`
`3173`	`3176`
`3174`	`3177`